Compare commits
5 Commits
main
...
feat/custo
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d649710627 | ||
|
|
498c15555e | ||
|
|
25345a909a | ||
|
|
bfd8321b42 | ||
|
|
fbbed439bd |
@ -9,6 +9,7 @@ Docs: https://docs.clawd.bot
|
||||
- Venius (Venice AI): highlight provider guide + cross-links + expanded guidance. https://docs.clawd.bot/providers/venice
|
||||
|
||||
### Changes
|
||||
- TTS: allow custom OpenAI-compatible TTS endpoints without API keys. (#1701) Thanks @louzhixian. https://docs.clawd.bot/tts
|
||||
- TTS: add Edge TTS provider fallback, defaulting to keyless Edge with MP3 retry on format failures. (#1668) Thanks @steipete. https://docs.clawd.bot/tts
|
||||
- Web search: add Brave freshness filter parameter for time-scoped results. (#1688) Thanks @JonUleis. https://docs.clawd.bot/tools/web
|
||||
- TTS: add auto mode enum (off/always/inbound/tagged) with per-session `/tts` override. (#1667) Thanks @sebslight. https://docs.clawd.bot/tts
|
||||
|
||||
@ -1538,6 +1538,7 @@ voice notes; other channels send MP3 audio.
|
||||
},
|
||||
openai: {
|
||||
apiKey: "openai_api_key",
|
||||
baseUrl: "https://api.openai.com/v1",
|
||||
model: "gpt-4o-mini-tts",
|
||||
voice: "alloy"
|
||||
}
|
||||
@ -1558,6 +1559,8 @@ Notes:
|
||||
- `/tts limit` and `/tts summary` control per-user summarization settings.
|
||||
- `apiKey` values fall back to `ELEVENLABS_API_KEY`/`XI_API_KEY` and `OPENAI_API_KEY`.
|
||||
- `elevenlabs.baseUrl` overrides the ElevenLabs API base URL.
|
||||
- `openai.baseUrl` overrides the OpenAI TTS base URL (defaults to `https://api.openai.com/v1`,
|
||||
or `OPENAI_TTS_BASE_URL` when set). Custom endpoints do not require an API key.
|
||||
- `elevenlabs.voiceSettings` supports `stability`/`similarityBoost`/`style` (0..1),
|
||||
`useSpeakerBoost`, and `speed` (0.5..2.0).
|
||||
|
||||
|
||||
35
docs/tts.md
35
docs/tts.md
@ -33,11 +33,18 @@ does not publish limits, so assume similar or lower limits.
|
||||
|
||||
If you want OpenAI or ElevenLabs:
|
||||
- `ELEVENLABS_API_KEY` (or `XI_API_KEY`)
|
||||
- `OPENAI_API_KEY`
|
||||
- `OPENAI_API_KEY` (optional for custom OpenAI-compatible endpoints)
|
||||
|
||||
Optional:
|
||||
- `OPENAI_TTS_BASE_URL` (defaults to `https://api.openai.com/v1`)
|
||||
|
||||
Edge TTS does **not** require an API key. If no API keys are found, Clawdbot defaults
|
||||
to Edge TTS (unless disabled via `messages.tts.edge.enabled=false`).
|
||||
|
||||
When `OPENAI_TTS_BASE_URL` (or `messages.tts.openai.baseUrl`) points to a non-OpenAI endpoint, Clawdbot relaxes
|
||||
OpenAI model/voice validation and does **not** require an API key. If your
|
||||
endpoint needs authentication, set `messages.tts.openai.apiKey` or `OPENAI_API_KEY`.
|
||||
|
||||
If multiple providers are configured, the selected provider is used first and the others are fallback options.
|
||||
Auto-summary uses the configured `summaryModel` (or `agents.defaults.model.primary`),
|
||||
so that provider must also be authenticated if you enable summaries.
|
||||
@ -115,6 +122,32 @@ Full schema is in [Gateway configuration](/gateway/configuration).
|
||||
}
|
||||
```
|
||||
|
||||
### Custom OpenAI-compatible endpoint
|
||||
|
||||
Set an endpoint in the environment:
|
||||
|
||||
```bash
|
||||
export OPENAI_TTS_BASE_URL="http://localhost:8880/v1"
|
||||
```
|
||||
|
||||
Then configure TTS as usual (API key optional for custom endpoints):
|
||||
|
||||
```json5
|
||||
{
|
||||
messages: {
|
||||
tts: {
|
||||
auto: "always",
|
||||
provider: "openai",
|
||||
openai: {
|
||||
baseUrl: "http://localhost:8880/v1",
|
||||
model: "kokoro",
|
||||
voice: "zm_yunxia"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Edge TTS primary (no API key)
|
||||
|
||||
```json5
|
||||
|
||||
@ -56,6 +56,7 @@ export type TtsConfig = {
|
||||
/** OpenAI configuration. */
|
||||
openai?: {
|
||||
apiKey?: string;
|
||||
baseUrl?: string;
|
||||
model?: string;
|
||||
voice?: string;
|
||||
};
|
||||
|
||||
@ -204,6 +204,7 @@ export const TtsConfigSchema = z
|
||||
openai: z
|
||||
.object({
|
||||
apiKey: z.string().optional(),
|
||||
baseUrl: z.string().optional(),
|
||||
model: z.string().optional(),
|
||||
voice: z.string().optional(),
|
||||
})
|
||||
|
||||
@ -131,7 +131,7 @@ export const ttsHandlers: GatewayRequestHandlers = {
|
||||
{
|
||||
id: "openai",
|
||||
name: "OpenAI",
|
||||
configured: Boolean(resolveTtsApiKey(config, "openai")),
|
||||
configured: isTtsProviderConfigured(config, "openai"),
|
||||
models: [...OPENAI_TTS_MODELS],
|
||||
voices: [...OPENAI_TTS_VOICES],
|
||||
},
|
||||
|
||||
@ -106,26 +106,36 @@ describe("tts", () => {
|
||||
expect(isValidOpenAIVoice("alloy ")).toBe(false);
|
||||
expect(isValidOpenAIVoice(" alloy")).toBe(false);
|
||||
});
|
||||
|
||||
it("accepts custom voices for custom endpoints", () => {
|
||||
expect(isValidOpenAIVoice("zm_yunxia", { baseUrl: "http://localhost:8880/v1" })).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isValidOpenAIModel", () => {
|
||||
it("accepts gpt-4o-mini-tts model", () => {
|
||||
it("accepts supported models", () => {
|
||||
expect(isValidOpenAIModel("gpt-4o-mini-tts")).toBe(true);
|
||||
expect(isValidOpenAIModel("tts-1")).toBe(true);
|
||||
expect(isValidOpenAIModel("tts-1-hd")).toBe(true);
|
||||
});
|
||||
|
||||
it("rejects other models", () => {
|
||||
expect(isValidOpenAIModel("tts-1")).toBe(false);
|
||||
expect(isValidOpenAIModel("tts-1-hd")).toBe(false);
|
||||
expect(isValidOpenAIModel("invalid")).toBe(false);
|
||||
expect(isValidOpenAIModel("")).toBe(false);
|
||||
expect(isValidOpenAIModel("gpt-4")).toBe(false);
|
||||
});
|
||||
|
||||
it("accepts custom models for custom endpoints", () => {
|
||||
expect(isValidOpenAIModel("kokoro", { baseUrl: "http://localhost:8880/v1" })).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("OPENAI_TTS_MODELS", () => {
|
||||
it("contains only gpt-4o-mini-tts", () => {
|
||||
it("contains supported models", () => {
|
||||
expect(OPENAI_TTS_MODELS).toContain("gpt-4o-mini-tts");
|
||||
expect(OPENAI_TTS_MODELS).toHaveLength(1);
|
||||
expect(OPENAI_TTS_MODELS).toContain("tts-1");
|
||||
expect(OPENAI_TTS_MODELS).toContain("tts-1-hd");
|
||||
expect(OPENAI_TTS_MODELS).toHaveLength(3);
|
||||
});
|
||||
|
||||
it("is a non-empty array", () => {
|
||||
@ -200,6 +210,30 @@ describe("tts", () => {
|
||||
expect(result.overrides.provider).toBe("edge");
|
||||
});
|
||||
|
||||
it("routes custom OpenAI models when custom endpoints are used", () => {
|
||||
const policy = resolveModelOverridePolicy({ enabled: true });
|
||||
const input = "Hello [[tts:provider=openai model=kokoro voice=zm_yunxia]] world";
|
||||
const result = parseTtsDirectives(input, policy, {
|
||||
defaultProvider: "openai",
|
||||
openaiBaseUrl: "http://localhost:8880/v1",
|
||||
});
|
||||
|
||||
expect(result.overrides.openai?.model).toBe("kokoro");
|
||||
expect(result.overrides.openai?.voice).toBe("zm_yunxia");
|
||||
});
|
||||
|
||||
it("routes model overrides to ElevenLabs when provider is ElevenLabs", () => {
|
||||
const policy = resolveModelOverridePolicy({ enabled: true });
|
||||
const input = "Hello [[tts:provider=elevenlabs model=eleven_multilingual_v2]] world";
|
||||
const result = parseTtsDirectives(input, policy, {
|
||||
defaultProvider: "openai",
|
||||
openaiBaseUrl: "http://localhost:8880/v1",
|
||||
});
|
||||
|
||||
expect(result.overrides.elevenlabs?.modelId).toBe("eleven_multilingual_v2");
|
||||
expect(result.overrides.openai?.model).toBeUndefined();
|
||||
});
|
||||
|
||||
it("keeps text intact when overrides are disabled", () => {
|
||||
const policy = resolveModelOverridePolicy({ enabled: false });
|
||||
const input = "Hello [[tts:voice=alloy]] world";
|
||||
@ -357,7 +391,12 @@ describe("tts", () => {
|
||||
};
|
||||
|
||||
const restoreEnv = (snapshot: Record<string, string | undefined>) => {
|
||||
const keys = ["OPENAI_API_KEY", "ELEVENLABS_API_KEY", "XI_API_KEY"] as const;
|
||||
const keys = [
|
||||
"OPENAI_API_KEY",
|
||||
"OPENAI_TTS_BASE_URL",
|
||||
"ELEVENLABS_API_KEY",
|
||||
"XI_API_KEY",
|
||||
] as const;
|
||||
for (const key of keys) {
|
||||
const value = snapshot[key];
|
||||
if (value === undefined) {
|
||||
@ -371,6 +410,7 @@ describe("tts", () => {
|
||||
const withEnv = (env: Record<string, string | undefined>, run: () => void) => {
|
||||
const snapshot = {
|
||||
OPENAI_API_KEY: process.env.OPENAI_API_KEY,
|
||||
OPENAI_TTS_BASE_URL: process.env.OPENAI_TTS_BASE_URL,
|
||||
ELEVENLABS_API_KEY: process.env.ELEVENLABS_API_KEY,
|
||||
XI_API_KEY: process.env.XI_API_KEY,
|
||||
};
|
||||
@ -392,6 +432,7 @@ describe("tts", () => {
|
||||
withEnv(
|
||||
{
|
||||
OPENAI_API_KEY: "test-openai-key",
|
||||
OPENAI_TTS_BASE_URL: undefined,
|
||||
ELEVENLABS_API_KEY: undefined,
|
||||
XI_API_KEY: undefined,
|
||||
},
|
||||
@ -407,6 +448,7 @@ describe("tts", () => {
|
||||
withEnv(
|
||||
{
|
||||
OPENAI_API_KEY: undefined,
|
||||
OPENAI_TTS_BASE_URL: undefined,
|
||||
ELEVENLABS_API_KEY: "test-elevenlabs-key",
|
||||
XI_API_KEY: undefined,
|
||||
},
|
||||
@ -422,6 +464,7 @@ describe("tts", () => {
|
||||
withEnv(
|
||||
{
|
||||
OPENAI_API_KEY: undefined,
|
||||
OPENAI_TTS_BASE_URL: undefined,
|
||||
ELEVENLABS_API_KEY: undefined,
|
||||
XI_API_KEY: undefined,
|
||||
},
|
||||
@ -432,6 +475,22 @@ describe("tts", () => {
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
it("prefers OpenAI when a custom endpoint is configured without API keys", () => {
|
||||
withEnv(
|
||||
{
|
||||
OPENAI_API_KEY: undefined,
|
||||
OPENAI_TTS_BASE_URL: "http://localhost:8880/v1",
|
||||
ELEVENLABS_API_KEY: undefined,
|
||||
XI_API_KEY: undefined,
|
||||
},
|
||||
() => {
|
||||
const config = resolveTtsConfig(baseCfg);
|
||||
const provider = getTtsProvider(config, "/tmp/tts-prefs-openai-custom.json");
|
||||
expect(provider).toBe("openai");
|
||||
},
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
describe("maybeApplyTtsToPayload", () => {
|
||||
@ -495,6 +554,55 @@ describe("tts", () => {
|
||||
process.env.CLAWDBOT_TTS_PREFS = prevPrefs;
|
||||
});
|
||||
|
||||
it("allows custom OpenAI endpoints without API keys", async () => {
|
||||
const prevPrefs = process.env.CLAWDBOT_TTS_PREFS;
|
||||
const prevBaseUrl = process.env.OPENAI_TTS_BASE_URL;
|
||||
const prevOpenAiKey = process.env.OPENAI_API_KEY;
|
||||
process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
|
||||
process.env.OPENAI_TTS_BASE_URL = "http://localhost:8880/v1";
|
||||
delete process.env.OPENAI_API_KEY;
|
||||
const originalFetch = globalThis.fetch;
|
||||
const fetchMock = vi.fn(async () => ({
|
||||
ok: true,
|
||||
arrayBuffer: async () => new ArrayBuffer(1),
|
||||
}));
|
||||
globalThis.fetch = fetchMock as unknown as typeof fetch;
|
||||
|
||||
const cfg = {
|
||||
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
|
||||
messages: {
|
||||
tts: {
|
||||
auto: "always",
|
||||
provider: "openai",
|
||||
openai: { model: "kokoro", voice: "zm_yunxia" },
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
const result = await maybeApplyTtsToPayload({
|
||||
payload: { text: "Hello world" },
|
||||
cfg,
|
||||
kind: "final",
|
||||
});
|
||||
|
||||
expect(result.mediaUrl).toBeDefined();
|
||||
expect(fetchMock).toHaveBeenCalledTimes(1);
|
||||
const [url, options] = fetchMock.mock.calls[0] ?? [];
|
||||
expect(url).toBe("http://localhost:8880/v1/audio/speech");
|
||||
expect(
|
||||
(options as { headers?: Record<string, string> })?.headers?.Authorization,
|
||||
).toBeUndefined();
|
||||
|
||||
globalThis.fetch = originalFetch;
|
||||
process.env.OPENAI_TTS_BASE_URL = prevBaseUrl;
|
||||
if (prevOpenAiKey === undefined) {
|
||||
delete process.env.OPENAI_API_KEY;
|
||||
} else {
|
||||
process.env.OPENAI_API_KEY = prevOpenAiKey;
|
||||
}
|
||||
process.env.CLAWDBOT_TTS_PREFS = prevPrefs;
|
||||
});
|
||||
|
||||
it("skips auto-TTS in tagged mode unless a tts tag is present", async () => {
|
||||
const prevPrefs = process.env.CLAWDBOT_TTS_PREFS;
|
||||
process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
|
||||
|
||||
136
src/tts/tts.ts
136
src/tts/tts.ts
@ -48,6 +48,7 @@ const DEFAULT_ELEVENLABS_VOICE_ID = "pMsXgVXv3BLzUgSXRplE";
|
||||
const DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2";
|
||||
const DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts";
|
||||
const DEFAULT_OPENAI_VOICE = "alloy";
|
||||
const DEFAULT_OPENAI_TTS_BASE_URL = "https://api.openai.com/v1";
|
||||
const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
|
||||
const DEFAULT_EDGE_LANG = "en-US";
|
||||
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
|
||||
@ -103,6 +104,7 @@ export type ResolvedTtsConfig = {
|
||||
};
|
||||
openai: {
|
||||
apiKey?: string;
|
||||
baseUrl: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
};
|
||||
@ -234,6 +236,9 @@ export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
|
||||
const raw: TtsConfig = cfg.messages?.tts ?? {};
|
||||
const providerSource = raw.provider ? "config" : "default";
|
||||
const edgeOutputFormat = raw.edge?.outputFormat?.trim();
|
||||
const openaiBaseUrl = normalizeOpenAiTtsBaseUrl(
|
||||
raw.openai?.baseUrl?.trim() || process.env.OPENAI_TTS_BASE_URL,
|
||||
);
|
||||
const auto = normalizeTtsAutoMode(raw.auto) ?? (raw.enabled ? "always" : "off");
|
||||
return {
|
||||
auto,
|
||||
@ -265,6 +270,7 @@ export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
|
||||
},
|
||||
openai: {
|
||||
apiKey: raw.openai?.apiKey,
|
||||
baseUrl: openaiBaseUrl,
|
||||
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
|
||||
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
|
||||
},
|
||||
@ -395,7 +401,9 @@ export function getTtsProvider(config: ResolvedTtsConfig, prefsPath: string): Tt
|
||||
if (prefs.tts?.provider) return prefs.tts.provider;
|
||||
if (config.providerSource === "config") return config.provider;
|
||||
|
||||
if (resolveTtsApiKey(config, "openai")) return "openai";
|
||||
if (resolveTtsApiKey(config, "openai") || isCustomOpenAiTtsEndpoint(config.openai.baseUrl)) {
|
||||
return "openai";
|
||||
}
|
||||
if (resolveTtsApiKey(config, "elevenlabs")) return "elevenlabs";
|
||||
return "edge";
|
||||
}
|
||||
@ -470,6 +478,12 @@ export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] {
|
||||
|
||||
export function isTtsProviderConfigured(config: ResolvedTtsConfig, provider: TtsProvider): boolean {
|
||||
if (provider === "edge") return config.edge.enabled;
|
||||
if (provider === "openai") {
|
||||
return (
|
||||
Boolean(resolveTtsApiKey(config, provider)) ||
|
||||
isCustomOpenAiTtsEndpoint(config.openai.baseUrl)
|
||||
);
|
||||
}
|
||||
return Boolean(resolveTtsApiKey(config, provider));
|
||||
}
|
||||
|
||||
@ -483,6 +497,16 @@ function normalizeElevenLabsBaseUrl(baseUrl: string): string {
|
||||
return trimmed.replace(/\/+$/, "");
|
||||
}
|
||||
|
||||
function normalizeOpenAiTtsBaseUrl(baseUrl?: string): string {
|
||||
const trimmed = baseUrl?.trim();
|
||||
if (!trimmed) return DEFAULT_OPENAI_TTS_BASE_URL;
|
||||
return trimmed.replace(/\/+$/, "");
|
||||
}
|
||||
|
||||
function isCustomOpenAiTtsEndpoint(baseUrl: string): boolean {
|
||||
return normalizeOpenAiTtsBaseUrl(baseUrl) !== DEFAULT_OPENAI_TTS_BASE_URL;
|
||||
}
|
||||
|
||||
function requireInRange(value: number, min: number, max: number, label: string): void {
|
||||
if (!Number.isFinite(value) || value < min || value > max) {
|
||||
throw new Error(`${label} must be between ${min} and ${max}`);
|
||||
@ -538,6 +562,7 @@ function parseNumberValue(value: string): number | undefined {
|
||||
function parseTtsDirectives(
|
||||
text: string,
|
||||
policy: ResolvedTtsModelOverrides,
|
||||
options: { defaultProvider?: TtsProvider; openaiBaseUrl?: string } = {},
|
||||
): TtsDirectiveParseResult {
|
||||
if (!policy.enabled) {
|
||||
return { cleanedText: text, overrides: {}, warnings: [], hasDirective: false };
|
||||
@ -561,6 +586,30 @@ function parseTtsDirectives(
|
||||
cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => {
|
||||
hasDirective = true;
|
||||
const tokens = body.split(/\s+/).filter(Boolean);
|
||||
let providerOverride: TtsProvider | undefined;
|
||||
|
||||
for (const token of tokens) {
|
||||
const eqIndex = token.indexOf("=");
|
||||
if (eqIndex === -1) continue;
|
||||
const rawKey = token.slice(0, eqIndex).trim();
|
||||
const rawValue = token.slice(eqIndex + 1).trim();
|
||||
if (!rawKey || !rawValue) continue;
|
||||
if (rawKey.toLowerCase() !== "provider") continue;
|
||||
if (!policy.allowProvider) break;
|
||||
if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
|
||||
providerOverride = rawValue;
|
||||
} else {
|
||||
warnings.push(`unsupported provider "${rawValue}"`);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (providerOverride) {
|
||||
overrides.provider = providerOverride;
|
||||
}
|
||||
|
||||
const providerHint = overrides.provider ?? options.defaultProvider;
|
||||
const openAiContext = { baseUrl: options.openaiBaseUrl };
|
||||
for (const token of tokens) {
|
||||
const eqIndex = token.indexOf("=");
|
||||
if (eqIndex === -1) continue;
|
||||
@ -571,18 +620,12 @@ function parseTtsDirectives(
|
||||
try {
|
||||
switch (key) {
|
||||
case "provider":
|
||||
if (!policy.allowProvider) break;
|
||||
if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
|
||||
overrides.provider = rawValue;
|
||||
} else {
|
||||
warnings.push(`unsupported provider "${rawValue}"`);
|
||||
}
|
||||
break;
|
||||
case "voice":
|
||||
case "openai_voice":
|
||||
case "openaivoice":
|
||||
if (!policy.allowVoice) break;
|
||||
if (isValidOpenAIVoice(rawValue)) {
|
||||
if (isValidOpenAIVoice(rawValue, openAiContext)) {
|
||||
overrides.openai = { ...overrides.openai, voice: rawValue };
|
||||
} else {
|
||||
warnings.push(`invalid OpenAI voice "${rawValue}"`);
|
||||
@ -602,17 +645,40 @@ function parseTtsDirectives(
|
||||
case "model":
|
||||
case "modelid":
|
||||
case "model_id":
|
||||
case "elevenlabs_model":
|
||||
case "elevenlabsmodel":
|
||||
case "openai_model":
|
||||
case "openaimodel":
|
||||
if (!policy.allowModelId) break;
|
||||
if (isValidOpenAIModel(rawValue)) {
|
||||
if (key === "openai_model" || key === "openaimodel") {
|
||||
if (isValidOpenAIModel(rawValue, openAiContext)) {
|
||||
overrides.openai = { ...overrides.openai, model: rawValue };
|
||||
} else {
|
||||
warnings.push(`invalid OpenAI model "${rawValue}"`);
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (providerHint === "openai") {
|
||||
if (isValidOpenAIModel(rawValue, openAiContext)) {
|
||||
overrides.openai = { ...overrides.openai, model: rawValue };
|
||||
} else {
|
||||
warnings.push(`invalid OpenAI model "${rawValue}"`);
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (providerHint === "elevenlabs") {
|
||||
overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue };
|
||||
break;
|
||||
}
|
||||
if (isValidOpenAIModel(rawValue, openAiContext)) {
|
||||
overrides.openai = { ...overrides.openai, model: rawValue };
|
||||
} else {
|
||||
overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue };
|
||||
}
|
||||
break;
|
||||
case "elevenlabs_model":
|
||||
case "elevenlabsmodel":
|
||||
if (!policy.allowModelId) break;
|
||||
overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue };
|
||||
break;
|
||||
case "stability":
|
||||
if (!policy.allowVoiceSettings) break;
|
||||
{
|
||||
@ -736,7 +802,7 @@ function parseTtsDirectives(
|
||||
};
|
||||
}
|
||||
|
||||
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts"] as const;
|
||||
export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const;
|
||||
export const OPENAI_TTS_VOICES = [
|
||||
"alloy",
|
||||
"ash",
|
||||
@ -750,12 +816,15 @@ export const OPENAI_TTS_VOICES = [
|
||||
] as const;
|
||||
|
||||
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
|
||||
type OpenAiTtsValidationContext = { baseUrl?: string };
|
||||
|
||||
function isValidOpenAIModel(model: string): boolean {
|
||||
function isValidOpenAIModel(model: string, context: OpenAiTtsValidationContext = {}): boolean {
|
||||
if (context.baseUrl && isCustomOpenAiTtsEndpoint(context.baseUrl)) return true;
|
||||
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
|
||||
}
|
||||
|
||||
function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
|
||||
function isValidOpenAIVoice(voice: string, context: OpenAiTtsValidationContext = {}): boolean {
|
||||
if (context.baseUrl && isCustomOpenAiTtsEndpoint(context.baseUrl)) return true;
|
||||
return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
|
||||
}
|
||||
|
||||
@ -963,18 +1032,21 @@ async function elevenLabsTTS(params: {
|
||||
|
||||
async function openaiTTS(params: {
|
||||
text: string;
|
||||
apiKey: string;
|
||||
apiKey?: string;
|
||||
baseUrl: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
responseFormat: "mp3" | "opus";
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const { text, apiKey, model, voice, responseFormat, timeoutMs } = params;
|
||||
const { text, model, voice, responseFormat, timeoutMs } = params;
|
||||
const apiKey = params.apiKey?.trim();
|
||||
const baseUrl = normalizeOpenAiTtsBaseUrl(params.baseUrl);
|
||||
|
||||
if (!isValidOpenAIModel(model)) {
|
||||
if (!isValidOpenAIModel(model, { baseUrl })) {
|
||||
throw new Error(`Invalid model: ${model}`);
|
||||
}
|
||||
if (!isValidOpenAIVoice(voice)) {
|
||||
if (!isValidOpenAIVoice(voice, { baseUrl })) {
|
||||
throw new Error(`Invalid voice: ${voice}`);
|
||||
}
|
||||
|
||||
@ -982,12 +1054,15 @@ async function openaiTTS(params: {
|
||||
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
||||
|
||||
try {
|
||||
const response = await fetch("https://api.openai.com/v1/audio/speech", {
|
||||
const headers: Record<string, string> = {
|
||||
"Content-Type": "application/json",
|
||||
};
|
||||
if (apiKey) {
|
||||
headers.Authorization = `Bearer ${apiKey}`;
|
||||
}
|
||||
const response = await fetch(`${baseUrl}/audio/speech`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
headers,
|
||||
body: JSON.stringify({
|
||||
model,
|
||||
input: text,
|
||||
@ -1137,13 +1212,19 @@ export async function textToSpeech(params: {
|
||||
}
|
||||
|
||||
const apiKey = resolveTtsApiKey(config, provider);
|
||||
if (!apiKey) {
|
||||
const allowMissingKey =
|
||||
provider === "openai" && isCustomOpenAiTtsEndpoint(config.openai.baseUrl);
|
||||
if (!apiKey && !allowMissingKey) {
|
||||
lastError = `No API key for ${provider}`;
|
||||
continue;
|
||||
}
|
||||
|
||||
let audioBuffer: Buffer;
|
||||
if (provider === "elevenlabs") {
|
||||
if (!apiKey) {
|
||||
lastError = "No API key for elevenlabs";
|
||||
continue;
|
||||
}
|
||||
const voiceIdOverride = params.overrides?.elevenlabs?.voiceId;
|
||||
const modelIdOverride = params.overrides?.elevenlabs?.modelId;
|
||||
const voiceSettings = {
|
||||
@ -1172,6 +1253,7 @@ export async function textToSpeech(params: {
|
||||
audioBuffer = await openaiTTS({
|
||||
text: params.text,
|
||||
apiKey,
|
||||
baseUrl: config.openai.baseUrl,
|
||||
model: openaiModelOverride ?? config.openai.model,
|
||||
voice: openaiVoiceOverride ?? config.openai.voice,
|
||||
responseFormat: output.openai,
|
||||
@ -1227,8 +1309,12 @@ export async function maybeApplyTtsToPayload(params: {
|
||||
});
|
||||
if (autoMode === "off") return params.payload;
|
||||
|
||||
const defaultProvider = getTtsProvider(config, prefsPath);
|
||||
const text = params.payload.text ?? "";
|
||||
const directives = parseTtsDirectives(text, config.modelOverrides);
|
||||
const directives = parseTtsDirectives(text, config.modelOverrides, {
|
||||
defaultProvider,
|
||||
openaiBaseUrl: config.openai.baseUrl,
|
||||
});
|
||||
if (directives.warnings.length > 0) {
|
||||
logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`);
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user