feat(tts): add Gemini TTS provider

This commit is contained in:
Shady Khalifa 2026-01-25 02:57:03 +02:00
parent 6859e1e6a6
commit a58bcae244
No known key found for this signature in database
GPG Key ID: 52DFAC81BEA54EAA
8 changed files with 452 additions and 31 deletions

View File

@ -1503,8 +1503,9 @@ active agents `identity.emoji` when set, otherwise `"👀"`. Set it to `""` t
#### `messages.tts`
Enable text-to-speech for outbound replies. When on, Clawdbot generates audio
using ElevenLabs or OpenAI and attaches it to responses. Telegram uses Opus
voice notes; other channels send MP3 audio.
using ElevenLabs, OpenAI, Edge TTS, or Gemini and attaches it to responses.
Telegram uses Opus voice notes; other channels send MP3 audio. Edge TTS does not
require an API key and is used when no other provider is configured.
```json5
{
@ -1540,6 +1541,17 @@ voice notes; other channels send MP3 audio.
apiKey: "openai_api_key",
model: "gpt-4o-mini-tts",
voice: "alloy"
},
edge: {
enabled: true,
voice: "en-US-MichelleNeural",
lang: "en-US"
},
gemini: {
apiKey: "gemini_api_key",
model: "gemini-2.5-flash-preview-tts",
voiceName: "Kore",
baseUrl: "generativelanguage.googleapis.com"
}
}
}
@ -1555,11 +1567,18 @@ Notes:
- `summaryModel` overrides `agents.defaults.model.primary` for auto-summary.
- Accepts `provider/model` or an alias from `agents.defaults.models`.
- `modelOverrides` enables model-driven overrides like `[[tts:...]]` tags (on by default).
- `provider`: `"elevenlabs"`, `"openai"`, `"edge"`, or `"gemini"`.
- If `provider` is unset, Clawdbot picks OpenAI when configured, then ElevenLabs, then Edge.
- Gemini falls back to ElevenLabs, then OpenAI, then Edge (if enabled).
- `/tts limit` and `/tts summary` control per-user summarization settings.
- `apiKey` values fall back to `ELEVENLABS_API_KEY`/`XI_API_KEY` and `OPENAI_API_KEY`.
- `apiKey` values fall back to `ELEVENLABS_API_KEY`/`XI_API_KEY`, `OPENAI_API_KEY`, and `GEMINI_API_KEY`.
- `elevenlabs.baseUrl` overrides the ElevenLabs API base URL.
- `elevenlabs.voiceSettings` supports `stability`/`similarityBoost`/`style` (0..1),
`useSpeakerBoost`, and `speed` (0.5..2.0).
- `edge.enabled` toggles Edge TTS (no API key required).
- `edge.outputFormat` sets the Edge TTS output format; defaults to MP3 if omitted.
- `gemini.baseUrl` overrides the Gemini API base URL; Clawdbot adds `https://` when no scheme is given and appends `/v1beta` if missing.
- Gemini TTS requires `ffmpeg` to transcode PCM into MP3/Opus.
### `talk`

View File

@ -8,7 +8,7 @@ read_when:
# Text-to-speech (TTS)
Clawdbot can convert outbound replies into audio using ElevenLabs, OpenAI, or Edge TTS.
Clawdbot can convert outbound replies into audio using ElevenLabs, OpenAI, Edge TTS, or Gemini.
It works anywhere Clawdbot can send audio; Telegram gets a round voice-note bubble.
## Supported services
@ -16,29 +16,32 @@ It works anywhere Clawdbot can send audio; Telegram gets a round voice-note bubb
- **ElevenLabs** (primary or fallback provider)
- **OpenAI** (primary or fallback provider; also used for summaries)
- **Edge TTS** (primary or fallback provider; uses `node-edge-tts`, default when no API keys)
- **Gemini** (primary provider with fallback to ElevenLabs, then OpenAI, then Edge if enabled)
### Edge TTS notes
Edge TTS uses Microsoft Edge's online neural TTS service via the `node-edge-tts`
library. It's a hosted service (not local), uses Microsoft's endpoints, and does
not require an API key. `node-edge-tts` exposes speech configuration options and
output formats, but not all options are supported by the Edge service. citeturn2search0
output formats, but not all options are supported by the Edge service.
Because Edge TTS is a public web service without a published SLA or quota, treat it
as best-effort. If you need guaranteed limits and support, use OpenAI or ElevenLabs.
Microsoft's Speech REST API documents a 10-minute audio limit per request; Edge TTS
does not publish limits, so assume similar or lower limits. citeturn0search3
does not publish limits, so assume similar or lower limits.
## Optional keys
If you want OpenAI or ElevenLabs:
If you want OpenAI, ElevenLabs, or Gemini:
- `ELEVENLABS_API_KEY` (or `XI_API_KEY`)
- `OPENAI_API_KEY`
- `GEMINI_API_KEY`
Edge TTS does **not** require an API key. If no API keys are found, Clawdbot defaults
to Edge TTS (unless disabled via `messages.tts.edge.enabled=false`).
If multiple providers are configured, the selected provider is used first and the others are fallback options.
Gemini falls back to ElevenLabs, then OpenAI, then Edge (if enabled).
Auto-summary uses the configured `summaryModel` (or `agents.defaults.model.primary`),
so that provider must also be authenticated if you enable summaries.
@ -50,6 +53,9 @@ so that provider must also be authenticated if you enable summaries.
- [ElevenLabs Authentication](https://elevenlabs.io/docs/api-reference/authentication)
- [node-edge-tts](https://github.com/SchneeHertz/node-edge-tts)
- [Microsoft Speech output formats](https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech#audio-outputs)
- [Gemini speech generation](https://ai.google.dev/gemini-api/docs/speech-generation)
- [Gemini TTS voices](https://ai.google.dev/gemini-api/docs/speech-generation#voices)
- [Gemini TTS models](https://ai.google.dev/gemini-api/docs/speech-generation#supported-models)
## Is it enabled by default?
@ -136,6 +142,25 @@ Full schema is in [Gateway configuration](/gateway/configuration).
}
```
### Gemini primary
```json5
{
messages: {
tts: {
enabled: true,
provider: "gemini",
gemini: {
apiKey: "gemini_api_key",
model: "gemini-2.5-flash-preview-tts",
voiceName: "Kore",
baseUrl: "generativelanguage.googleapis.com"
}
}
}
}
```
### Disable Edge TTS
```json5
@ -202,17 +227,20 @@ Then run:
- `tagged` only sends audio when the reply includes `[[tts]]` tags.
- `enabled`: legacy toggle (doctor migrates this to `auto`).
- `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
- `provider`: `"elevenlabs"`, `"openai"`, or `"edge"` (fallback is automatic).
- If `provider` is **unset**, Clawdbot prefers `openai` (if key), then `elevenlabs` (if key),
otherwise `edge`.
- `provider`: `"elevenlabs"`, `"openai"`, `"edge"`, or `"gemini"`.
- If `provider` is **unset**, Clawdbot picks OpenAI when configured, then ElevenLabs, then Edge.
- Gemini falls back to ElevenLabs, then OpenAI, then Edge (if enabled).
- `summaryModel`: optional cheap model for auto-summary; defaults to `agents.defaults.model.primary`.
- Accepts `provider/model` or a configured model alias.
- `modelOverrides`: allow the model to emit TTS directives (on by default).
- `maxTextLength`: hard cap for TTS input (chars). `/tts audio` fails if exceeded.
- `timeoutMs`: request timeout (ms).
- `prefsPath`: override the local prefs JSON path (provider/limit/summary).
- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `OPENAI_API_KEY`).
- `prefsPath`: override the local prefs JSON path.
- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `OPENAI_API_KEY`, `GEMINI_API_KEY`).
- `elevenlabs.baseUrl`: override ElevenLabs API base URL.
- `gemini.baseUrl`: base URL for the Gemini API; if it doesn't end in `/v1beta`, Clawdbot appends it.
- `gemini.voiceName`: prebuilt Gemini voice name (free-form; see Gemini docs for the list).
- `gemini.model`: Gemini TTS model name (free-form).
- `elevenlabs.voiceSettings`:
- `stability`, `similarityBoost`, `style`: `0..1`
- `useSpeakerBoost`: `true|false`
@ -250,9 +278,9 @@ Here you go.
```
Available directive keys (when enabled):
- `provider` (`openai` | `elevenlabs` | `edge`)
- `voice` (OpenAI voice) or `voiceId` (ElevenLabs)
- `model` (OpenAI TTS model or ElevenLabs model id)
- `provider` (`openai` | `elevenlabs` | `edge` | `gemini`)
- `voice` (OpenAI voice), `voiceId` (ElevenLabs), or `voiceName` (Gemini)
- `model` (OpenAI TTS model, ElevenLabs model id, or Gemini model name)
- `stability`, `similarityBoost`, `style`, `speed`, `useSpeakerBoost`
- `applyTextNormalization` (`auto|on|off`)
- `languageCode` (ISO 639-1)
@ -304,18 +332,20 @@ These override `messages.tts.*` for that host.
## Output formats (fixed)
- **Telegram**: Opus voice note (`opus_48000_64` from ElevenLabs, `opus` from OpenAI).
- **Telegram**: Opus voice note (`opus_48000_64` from ElevenLabs, `opus` from OpenAI/Gemini).
- 48kHz / 64kbps is a good voice-note tradeoff and required for the round bubble.
- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI).
- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI/Gemini).
- 44.1kHz / 128kbps is the default balance for speech clarity.
- **Edge TTS**: uses `edge.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`).
- `node-edge-tts` accepts an `outputFormat`, but not all formats are available
from the Edge service. citeturn2search0
- Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus). citeturn1search0
from the Edge service.
- Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus).
- Telegram `sendVoice` accepts OGG/MP3/M4A; use OpenAI/ElevenLabs if you need
guaranteed Opus voice notes. citeturn1search1
guaranteed Opus voice notes.
- If the configured Edge output format fails, Clawdbot retries with MP3.
Gemini returns raw PCM audio and requires `ffmpeg` to transcode into MP3/Opus.
OpenAI/ElevenLabs formats are fixed; Telegram expects Opus for voice-note UX.
## Auto-TTS behavior
@ -359,7 +389,7 @@ Discord note: `/tts` is a built-in Discord command, so Clawdbot registers
/tts inbound
/tts tagged
/tts status
/tts provider openai
/tts provider gemini
/tts limit 2000
/tts summary off
/tts audio Hello from Clawdbot

View File

@ -45,6 +45,7 @@ function ttsUsage(): ReplyPayload {
"/tts always\n" +
"/tts provider openai\n" +
"/tts provider edge\n" +
"/tts provider gemini\n" +
"/tts limit 2000\n" +
"/tts summary off\n" +
"/tts audio Hello from Clawdbot",
@ -152,6 +153,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai"));
const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs"));
const hasEdge = isTtsProviderConfigured(config, "edge");
const hasGemini = Boolean(resolveTtsApiKey(config, "gemini"));
return {
shouldContinue: false,
reply: {
@ -162,13 +164,19 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand
`OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` +
`ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` +
`Edge enabled: ${hasEdge ? "✅" : "❌"}\n` +
`Usage: /tts provider openai | elevenlabs | edge`,
`Gemini key: ${hasGemini ? "✅" : "❌"}\n` +
`Usage: /tts provider openai | elevenlabs | edge | gemini`,
},
};
}
const requested = args.trim().toLowerCase();
if (requested !== "openai" && requested !== "elevenlabs" && requested !== "edge") {
if (
requested !== "openai" &&
requested !== "elevenlabs" &&
requested !== "edge" &&
requested !== "gemini"
) {
return { shouldContinue: false, reply: ttsUsage() };
}

View File

@ -1,4 +1,4 @@
export type TtsProvider = "elevenlabs" | "openai" | "edge";
export type TtsProvider = "elevenlabs" | "openai" | "edge" | "gemini";
export type TtsMode = "final" | "all";
@ -73,6 +73,13 @@ export type TtsConfig = {
proxy?: string;
timeoutMs?: number;
};
/** Gemini configuration. */
gemini?: {
apiKey?: string;
model?: string;
voiceName?: string;
baseUrl?: string;
};
/** Optional path for local TTS user preferences JSON. */
prefsPath?: string;
/** Hard cap for text sent to TTS (chars). */

View File

@ -156,7 +156,7 @@ export const MarkdownConfigSchema = z
.strict()
.optional();
export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge"]);
export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge", "gemini"]);
export const TtsModeSchema = z.enum(["final", "all"]);
export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]);
export const TtsConfigSchema = z
@ -224,6 +224,15 @@ export const TtsConfigSchema = z
})
.strict()
.optional(),
gemini: z
.object({
apiKey: z.string().optional(),
model: z.string().optional(),
voiceName: z.string().optional(),
baseUrl: z.string().optional(),
})
.strict()
.optional(),
prefsPath: z.string().optional(),
maxTextLength: z.number().int().min(1).optional(),
timeoutMs: z.number().int().min(1000).max(120000).optional(),

View File

@ -1,5 +1,6 @@
import { loadConfig } from "../../config/config.js";
import {
GEMINI_TTS_MODELS,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
getTtsProvider,
@ -38,6 +39,7 @@ export const ttsHandlers: GatewayRequestHandlers = {
prefsPath,
hasOpenAIKey: Boolean(resolveTtsApiKey(config, "openai")),
hasElevenLabsKey: Boolean(resolveTtsApiKey(config, "elevenlabs")),
hasGeminiKey: Boolean(resolveTtsApiKey(config, "gemini")),
edgeEnabled: isTtsProviderConfigured(config, "edge"),
});
} catch (err) {
@ -100,13 +102,18 @@ export const ttsHandlers: GatewayRequestHandlers = {
},
"tts.setProvider": async ({ params, respond }) => {
const provider = typeof params.provider === "string" ? params.provider.trim() : "";
if (provider !== "openai" && provider !== "elevenlabs" && provider !== "edge") {
if (
provider !== "openai" &&
provider !== "elevenlabs" &&
provider !== "edge" &&
provider !== "gemini"
) {
respond(
false,
undefined,
errorShape(
ErrorCodes.INVALID_REQUEST,
"Invalid provider. Use openai, elevenlabs, or edge.",
"Invalid provider. Use openai, elevenlabs, edge, or gemini.",
),
);
return;
@ -147,6 +154,12 @@ export const ttsHandlers: GatewayRequestHandlers = {
configured: isTtsProviderConfigured(config, "edge"),
models: [],
},
{
id: "gemini",
name: "Gemini",
configured: Boolean(resolveTtsApiKey(config, "gemini")),
models: [...GEMINI_TTS_MODELS],
},
],
active: getTtsProvider(config, prefsPath),
});

View File

@ -45,11 +45,13 @@ const {
isValidOpenAIModel,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
GEMINI_TTS_MODELS,
parseTtsDirectives,
resolveModelOverridePolicy,
summarizeText,
resolveOutputFormat,
resolveEdgeOutputFormat,
normalizeGeminiBaseUrl,
} = _test;
describe("tts", () => {
@ -136,10 +138,18 @@ describe("tts", () => {
});
});
// Guards the exported model list: both Gemini preview TTS model ids must remain present.
describe("GEMINI_TTS_MODELS", () => {
it("includes Gemini TTS preview models", () => {
expect(GEMINI_TTS_MODELS).toContain("gemini-2.5-flash-preview-tts");
expect(GEMINI_TTS_MODELS).toContain("gemini-2.5-pro-preview-tts");
});
});
describe("resolveOutputFormat", () => {
it("uses Opus for Telegram", () => {
const output = resolveOutputFormat("telegram");
expect(output.openai).toBe("opus");
expect(output.gemini).toBe("opus");
expect(output.elevenlabs).toBe("opus_48000_64");
expect(output.extension).toBe(".opus");
expect(output.voiceCompatible).toBe(true);
@ -148,6 +158,7 @@ describe("tts", () => {
it("uses MP3 for other channels", () => {
const output = resolveOutputFormat("discord");
expect(output.openai).toBe("mp3");
expect(output.gemini).toBe("mp3");
expect(output.elevenlabs).toBe("mp3_44100_128");
expect(output.extension).toBe(".mp3");
expect(output.voiceCompatible).toBe(false);
@ -178,6 +189,24 @@ describe("tts", () => {
});
});
// Covers the three normalization paths for a configured Gemini base URL:
// a bare host, a URL that already ends in /v1beta, and a URL with a custom path.
describe("normalizeGeminiBaseUrl", () => {
// A bare host gets an https:// scheme and the /v1beta suffix.
it("adds scheme and v1beta", () => {
expect(normalizeGeminiBaseUrl("example.com")).toBe("https://example.com/v1beta");
});
// An already-normalized URL is returned unchanged (no duplicated suffix).
it("preserves v1beta when provided", () => {
expect(normalizeGeminiBaseUrl("https://example.com/v1beta")).toBe(
"https://example.com/v1beta",
);
});
// A custom path prefix is kept and /v1beta is appended after it.
it("appends v1beta to custom paths", () => {
expect(normalizeGeminiBaseUrl("https://example.com/foo")).toBe(
"https://example.com/foo/v1beta",
);
});
});
describe("parseTtsDirectives", () => {
it("extracts overrides and strips directives when enabled", () => {
const policy = resolveModelOverridePolicy({ enabled: true });
@ -202,6 +231,17 @@ describe("tts", () => {
expect(result.overrides.provider).toBe("edge");
});
// A single directive carrying provider, voiceName, and model should populate
// overrides.provider and the Gemini-specific overrides.
it("accepts Gemini provider overrides", () => {
const policy = resolveModelOverridePolicy({ enabled: true });
const input =
"Hello [[tts:provider=gemini voiceName=Kore model=gemini-2.5-flash-preview-tts]] world";
const result = parseTtsDirectives(input, policy);
expect(result.overrides.provider).toBe("gemini");
expect(result.overrides.gemini?.voiceName).toBe("Kore");
expect(result.overrides.gemini?.model).toBe("gemini-2.5-flash-preview-tts");
});
it("keeps text intact when overrides are disabled", () => {
const policy = resolveModelOverridePolicy({ enabled: false });
const input = "Hello [[tts:voice=alloy]] world";

View File

@ -8,6 +8,7 @@ import {
renameSync,
unlinkSync,
} from "node:fs";
import { spawn } from "node:child_process";
import { tmpdir } from "node:os";
import path from "node:path";
@ -51,6 +52,9 @@ const DEFAULT_OPENAI_VOICE = "alloy";
const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";
const DEFAULT_EDGE_LANG = "en-US";
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
const DEFAULT_GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta";
const DEFAULT_GEMINI_MODEL = "gemini-2.5-flash-preview-tts";
const DEFAULT_GEMINI_VOICE = "Kore";
const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
stability: 0.5,
@ -62,6 +66,7 @@ const DEFAULT_ELEVENLABS_VOICE_SETTINGS = {
const TELEGRAM_OUTPUT = {
openai: "opus" as const,
gemini: "opus" as const,
// ElevenLabs output formats use codec_sample_rate_bitrate naming.
// Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram.
elevenlabs: "opus_48000_64",
@ -71,6 +76,7 @@ const TELEGRAM_OUTPUT = {
const DEFAULT_OUTPUT = {
openai: "mp3" as const,
gemini: "mp3" as const,
elevenlabs: "mp3_44100_128",
extension: ".mp3",
voiceCompatible: false,
@ -124,6 +130,12 @@ export type ResolvedTtsConfig = {
proxy?: string;
timeoutMs?: number;
};
gemini: {
apiKey?: string;
model: string;
voiceName: string;
baseUrl: string;
};
prefsPath?: string;
maxTextLength: number;
timeoutMs: number;
@ -157,6 +169,10 @@ type TtsDirectiveOverrides = {
voice?: string;
model?: string;
};
gemini?: {
voiceName?: string;
model?: string;
};
elevenlabs?: {
voiceId?: string;
modelId?: string;
@ -296,6 +312,12 @@ export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
proxy: raw.edge?.proxy?.trim() || undefined,
timeoutMs: raw.edge?.timeoutMs,
},
gemini: {
apiKey: raw.gemini?.apiKey,
model: normalizeGeminiModel(raw.gemini?.model),
voiceName: normalizeGeminiVoiceName(raw.gemini?.voiceName),
baseUrl: normalizeGeminiBaseUrl(raw.gemini?.baseUrl),
},
prefsPath: raw.prefsPath,
maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH,
timeoutMs: raw.timeoutMs ?? DEFAULT_TIMEOUT_MS,
@ -474,12 +496,18 @@ export function resolveTtsApiKey(
if (provider === "openai") {
return config.openai.apiKey || process.env.OPENAI_API_KEY;
}
if (provider === "gemini") {
return config.gemini.apiKey || process.env.GEMINI_API_KEY;
}
return undefined;
}
export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge"] as const;
export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge", "gemini"] as const;
/**
 * Builds the provider attempt order for a given primary provider.
 *
 * Gemini has a fixed chain (ElevenLabs, then OpenAI, then Edge). Every other
 * primary is followed by the remaining `TTS_PROVIDERS` entries in their
 * declared order.
 *
 * @param primary Provider to try first.
 * @returns Providers in the order they should be attempted, starting with `primary`.
 */
export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] {
  switch (primary) {
    case "gemini":
      return ["gemini", "elevenlabs", "openai", "edge"];
    default: {
      const fallbacks = TTS_PROVIDERS.filter((candidate) => candidate !== primary);
      return [primary, ...fallbacks];
    }
  }
}
@ -498,6 +526,45 @@ function normalizeElevenLabsBaseUrl(baseUrl: string): string {
return trimmed.replace(/\/+$/, "");
}
/**
 * Checks that a Gemini model or voice name is non-empty and consists only of
 * ASCII letters, digits, `.`, `_`, and `-`.
 *
 * @param value Candidate model or voice name.
 * @returns `true` when every character is in the allowed set.
 */
function isSafeGeminiValue(value: string): boolean {
  const allowedChars = /^[a-zA-Z0-9._-]+$/;
  return allowedChars.test(value);
}
/**
 * Normalizes a configured Gemini API base URL.
 *
 * - Blank or missing input yields `DEFAULT_GEMINI_BASE_URL`.
 * - A value without an `http://` or `https://` prefix is treated as `https://<value>`.
 * - Trailing slashes, the query string, and the fragment are dropped.
 * - `/v1beta` is appended unless the path already ends with it.
 *
 * @param baseUrl Host or URL from configuration, e.g. `example.com` or `https://example.com/foo`.
 * @returns An absolute URL ending in `/v1beta`, without a trailing slash.
 * @throws {TypeError} If the value cannot be parsed as a URL; the message names the setting.
 */
function normalizeGeminiBaseUrl(baseUrl?: string): string {
  const trimmed = baseUrl?.trim();
  if (!trimmed) return DEFAULT_GEMINI_BASE_URL;
  const withScheme = /^https?:\/\//i.test(trimmed) ? trimmed : `https://${trimmed}`;
  let url: URL;
  try {
    url = new URL(withScheme);
  } catch {
    // `new URL` only says "Invalid URL"; report which setting is wrong instead.
    throw new TypeError(`Invalid Gemini baseUrl: ${trimmed}`);
  }
  // Stripping trailing slashes turns a bare "/" into "", so a single suffix
  // check also covers the root path.
  const basePath = url.pathname.replace(/\/+$/, "");
  url.pathname = basePath.endsWith("/v1beta") ? basePath : `${basePath}/v1beta`;
  url.search = "";
  url.hash = "";
  return url.toString().replace(/\/+$/, "");
}
/**
 * Resolves the Gemini model name from configuration.
 *
 * @param model Optional configured model name; surrounding whitespace is ignored.
 * @returns `DEFAULT_GEMINI_MODEL` when the value is missing or blank, otherwise the trimmed name.
 * @throws {Error} If the trimmed name fails `isSafeGeminiValue`.
 */
function normalizeGeminiModel(model?: string): string {
  const candidate = (model ?? "").trim();
  if (candidate === "") {
    return DEFAULT_GEMINI_MODEL;
  }
  if (isSafeGeminiValue(candidate)) {
    return candidate;
  }
  throw new Error("Gemini model contains invalid characters");
}
/**
 * Resolves the Gemini prebuilt voice name from configuration.
 *
 * @param voiceName Optional configured voice name; surrounding whitespace is ignored.
 * @returns `DEFAULT_GEMINI_VOICE` when the value is missing or blank, otherwise the trimmed name.
 * @throws {Error} If the trimmed name fails `isSafeGeminiValue`.
 */
function normalizeGeminiVoiceName(voiceName?: string): string {
  const candidate = (voiceName ?? "").trim();
  if (candidate === "") {
    return DEFAULT_GEMINI_VOICE;
  }
  if (isSafeGeminiValue(candidate)) {
    return candidate;
  }
  throw new Error("Gemini voiceName contains invalid characters");
}
function requireInRange(value: number, min: number, max: number, label: string): void {
if (!Number.isFinite(value) || value < min || value > max) {
throw new Error(`${label} must be between ${min} and ${max}`);
@ -576,6 +643,24 @@ function parseTtsDirectives(
cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => {
hasDirective = true;
const tokens = body.split(/\s+/).filter(Boolean);
const providerOverrideFromTokens = (() => {
for (const token of tokens) {
const eqIndex = token.indexOf("=");
if (eqIndex === -1) continue;
const rawKey = token.slice(0, eqIndex).trim().toLowerCase();
if (rawKey !== "provider") continue;
const rawValue = token.slice(eqIndex + 1).trim();
if (
rawValue === "openai" ||
rawValue === "elevenlabs" ||
rawValue === "edge" ||
rawValue === "gemini"
) {
return rawValue;
}
}
return undefined;
})();
for (const token of tokens) {
const eqIndex = token.indexOf("=");
if (eqIndex === -1) continue;
@ -587,7 +672,12 @@ function parseTtsDirectives(
switch (key) {
case "provider":
if (!policy.allowProvider) break;
if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") {
if (
rawValue === "openai" ||
rawValue === "elevenlabs" ||
rawValue === "edge" ||
rawValue === "gemini"
) {
overrides.provider = rawValue;
} else {
warnings.push(`unsupported provider "${rawValue}"`);
@ -624,10 +714,38 @@ function parseTtsDirectives(
if (!policy.allowModelId) break;
if (isValidOpenAIModel(rawValue)) {
overrides.openai = { ...overrides.openai, model: rawValue };
} else if (providerOverrideFromTokens === "gemini") {
if (isSafeGeminiValue(rawValue)) {
overrides.gemini = { ...overrides.gemini, model: rawValue };
} else {
warnings.push(`invalid Gemini model "${rawValue}"`);
}
} else if (isGeminiModelToken(rawValue)) {
overrides.gemini = { ...overrides.gemini, model: rawValue };
} else {
overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue };
}
break;
case "gemini_model":
case "geminimodel":
if (!policy.allowModelId) break;
if (isSafeGeminiValue(rawValue)) {
overrides.gemini = { ...overrides.gemini, model: rawValue };
} else {
warnings.push(`invalid Gemini model "${rawValue}"`);
}
break;
case "voicename":
case "voice_name":
case "gemini_voice":
case "geminivoice":
if (!policy.allowVoice) break;
if (isSafeGeminiValue(rawValue)) {
overrides.gemini = { ...overrides.gemini, voiceName: rawValue };
} else {
warnings.push(`invalid Gemini voiceName "${rawValue}"`);
}
break;
case "stability":
if (!policy.allowVoiceSettings) break;
{
@ -774,6 +892,11 @@ export const OPENAI_TTS_VOICES = [
"shimmer",
] as const;
/**
 * Gemini TTS preview model ids known to this module.
 *
 * This is a reference list, not an allow-list: `normalizeGeminiModel` accepts
 * any model name made of letters, digits, `.`, `_`, and `-`.
 */
export const GEMINI_TTS_MODELS = [
"gemini-2.5-flash-preview-tts",
"gemini-2.5-pro-preview-tts",
] as const;
type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
function isValidOpenAIModel(model: string): boolean {
@ -782,6 +905,10 @@ function isValidOpenAIModel(model: string): boolean {
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
}
/**
 * Recognizes a directive `model` value as a Gemini model name.
 *
 * @param model Raw model token.
 * @returns `true` when the token starts with `gemini-` and passes `isSafeGeminiValue`.
 */
function isGeminiModelToken(model: string): boolean {
  if (!model.startsWith("gemini-")) {
    return false;
  }
  return isSafeGeminiValue(model);
}
function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
// Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
if (isCustomOpenAIEndpoint) return true;
@ -1068,6 +1195,143 @@ async function edgeTTS(params: {
await tts.ttsPromise(text, outputPath);
}
/**
 * Requests speech audio from the Gemini `generateContent` endpoint.
 *
 * @param params.text Text to synthesize.
 * @param params.apiKey Sent in the `x-goog-api-key` request header.
 * @param params.model Model id; becomes part of the request path, so it must pass `isSafeGeminiValue`.
 * @param params.voiceName Prebuilt voice name; must pass `isSafeGeminiValue`.
 * @param params.baseUrl API base URL; passed through `normalizeGeminiBaseUrl` again before use.
 * @param params.timeoutMs The request is aborted after this many milliseconds.
 * @returns The base64-decoded inline audio payload. NOTE(review): the caller treats
 *   this as raw PCM and transcodes it with ffmpeg; confirm against the Gemini docs.
 * @throws {Error} If `model`/`voiceName` contain disallowed characters, the HTTP
 *   status is not OK, or no part of the response carries inline audio data. An
 *   aborted request rejects with whatever `fetch` throws on abort.
 */
async function geminiTTS(params: {
  text: string;
  apiKey: string;
  model: string;
  voiceName: string;
  baseUrl: string;
  timeoutMs: number;
}): Promise<Buffer> {
  const { text, apiKey, model, voiceName, baseUrl, timeoutMs } = params;
  // Overrides reach this function without going through config normalization,
  // so re-validate before the values are placed in the URL and request body.
  if (!isSafeGeminiValue(model)) {
    throw new Error(`Gemini model contains invalid characters: ${model}`);
  }
  if (!isSafeGeminiValue(voiceName)) {
    throw new Error(`Gemini voiceName contains invalid characters: ${voiceName}`);
  }

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const url = new URL(
      `${normalizeGeminiBaseUrl(baseUrl)}/models/${encodeURIComponent(model)}:generateContent`,
    );
    const response = await fetch(url.toString(), {
      method: "POST",
      headers: {
        "x-goog-api-key": apiKey,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model,
        contents: [{ parts: [{ text }] }],
        generationConfig: {
          responseModalities: ["AUDIO"],
          speechConfig: {
            voiceConfig: {
              prebuiltVoiceConfig: { voiceName },
            },
          },
        },
      }),
      signal: controller.signal,
    });

    if (!response.ok) {
      throw new Error(`Gemini TTS API error (${response.status})`);
    }

    const payload = (await response.json()) as {
      candidates?: Array<{
        content?: {
          parts?: Array<{ inlineData?: { data?: string } }>;
        };
      }>;
    };

    // The audio is not guaranteed to sit in parts[0] of candidates[0]; take the
    // first part, in response order, that carries non-empty inline data.
    let data: string | undefined;
    const candidates = Array.isArray(payload?.candidates) ? payload.candidates : [];
    for (const candidate of candidates) {
      const parts = Array.isArray(candidate?.content?.parts) ? candidate.content.parts : [];
      for (const part of parts) {
        const inline = part?.inlineData?.data;
        if (typeof inline === "string" && inline.trim()) {
          data = inline.trim();
          break;
        }
      }
      if (data) break;
    }
    if (!data) {
      throw new Error("Gemini TTS response missing audio data");
    }
    return Buffer.from(data, "base64");
  } finally {
    clearTimeout(timeout);
  }
}
/**
 * Transcodes raw PCM to MP3 or Opus by piping it through an `ffmpeg` child process.
 *
 * The input is declared to ffmpeg as signed 16-bit little-endian, 24 kHz, mono.
 * Output is mono: Opus via libopus at 64 kbps / 48 kHz, or MP3 at 128 kbps / 44.1 kHz.
 *
 * @param params.pcm Raw PCM bytes written to ffmpeg's stdin.
 * @param params.format Target format.
 * @param params.timeoutMs ffmpeg is killed with SIGKILL after this many milliseconds.
 * @returns The encoded audio collected from ffmpeg's stdout.
 * @throws The spawn error unchanged (callers inspect `code === "ENOENT"` to detect a
 *   missing ffmpeg), a timeout error, or an `Error` carrying ffmpeg's stderr when it
 *   exits with a non-zero code.
 */
async function transcodePcmWithFfmpeg(params: {
  pcm: Buffer;
  format: "mp3" | "opus";
  timeoutMs: number;
}): Promise<Buffer> {
  const { pcm, format, timeoutMs } = params;
  const args = [
    "-hide_banner",
    "-loglevel",
    "error",
    "-f",
    "s16le",
    "-ar",
    "24000",
    "-ac",
    "1",
    "-i",
    "pipe:0",
  ];

  if (format === "opus") {
    args.push("-f", "opus", "-c:a", "libopus", "-b:a", "64k", "-ar", "48000", "-ac", "1");
  } else {
    args.push("-f", "mp3", "-b:a", "128k", "-ar", "44100", "-ac", "1");
  }

  args.push("pipe:1");

  return await new Promise<Buffer>((resolve, reject) => {
    const child = spawn("ffmpeg", args, {
      stdio: ["pipe", "pipe", "pipe"],
    });

    const stdoutChunks: Buffer[] = [];
    let stderr = "";
    let stdinError: Error | undefined;
    let settled = false;
    let timedOut = false;

    const timer = setTimeout(() => {
      timedOut = true;
      child.kill("SIGKILL");
    }, timeoutMs);

    // Both "error" and "close" can fire for one child; only the first one settles the promise.
    const settle = (finish: () => void): void => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      finish();
    };

    child.on("error", (err) => {
      settle(() => reject(err));
    });

    child.stdout?.on("data", (chunk) => {
      stdoutChunks.push(Buffer.from(chunk));
    });

    child.stderr?.on("data", (chunk) => {
      stderr += chunk.toString();
    });

    child.on("close", (code) => {
      settle(() => {
        if (timedOut) {
          // A SIGKILLed child closes with a null code; report the real cause.
          reject(new Error(`ffmpeg timed out after ${timeoutMs}ms`));
          return;
        }
        if (code !== 0) {
          reject(
            new Error(
              stderr.trim() || stdinError?.message || `ffmpeg exited with code ${code ?? "unknown"}`,
            ),
          );
          return;
        }
        resolve(Buffer.concat(stdoutChunks));
      });
    });

    // Without a listener, a stdin failure (for example EPIPE when ffmpeg exits
    // before reading all input) is an unhandled stream "error" event and takes
    // down the process. Record it; the child's "error"/"close" handlers above
    // decide how the promise settles.
    child.stdin?.on("error", (err) => {
      stdinError = err;
    });
    child.stdin?.end(pcm);
  });
}
export async function textToSpeech(params: {
text: string;
cfg: ClawdbotConfig;
@ -1172,7 +1436,31 @@ export async function textToSpeech(params: {
}
let audioBuffer: Buffer;
if (provider === "elevenlabs") {
if (provider === "gemini") {
const geminiModelOverride = params.overrides?.gemini?.model;
const geminiVoiceOverride = params.overrides?.gemini?.voiceName;
const pcmBuffer = await geminiTTS({
text: params.text,
apiKey,
model: geminiModelOverride ?? config.gemini.model,
voiceName: geminiVoiceOverride ?? config.gemini.voiceName,
baseUrl: config.gemini.baseUrl,
timeoutMs: config.timeoutMs,
});
try {
audioBuffer = await transcodePcmWithFfmpeg({
pcm: pcmBuffer,
format: output.gemini,
timeoutMs: config.timeoutMs,
});
} catch (err) {
const error = err as NodeJS.ErrnoException;
if (error.code === "ENOENT") {
throw new Error("ffmpeg not found; Gemini TTS requires ffmpeg");
}
throw err;
}
} else if (provider === "elevenlabs") {
const voiceIdOverride = params.overrides?.elevenlabs?.voiceId;
const modelIdOverride = params.overrides?.elevenlabs?.modelId;
const voiceSettings = {
@ -1220,7 +1508,12 @@ export async function textToSpeech(params: {
audioPath,
latencyMs,
provider,
outputFormat: provider === "openai" ? output.openai : output.elevenlabs,
outputFormat:
provider === "openai"
? output.openai
: provider === "gemini"
? output.gemini
: output.elevenlabs,
voiceCompatible: output.voiceCompatible,
};
} catch (err) {
@ -1463,9 +1756,11 @@ export const _test = {
isValidOpenAIModel,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
GEMINI_TTS_MODELS,
parseTtsDirectives,
resolveModelOverridePolicy,
summarizeText,
resolveOutputFormat,
resolveEdgeOutputFormat,
normalizeGeminiBaseUrl,
};