diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index 1d270974d..1635dda8d 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -1498,7 +1498,7 @@ active agent’s `identity.emoji` when set, otherwise `"👀"`. Set it to `""` t #### `messages.tts` Enable text-to-speech for outbound replies. When on, Moltbot generates audio -using ElevenLabs or OpenAI and attaches it to responses. Telegram uses Opus +using ElevenLabs, OpenAI or Gemini and attaches it to responses. Telegram uses Opus voice notes; other channels send MP3 audio. ```json5 @@ -1535,6 +1535,17 @@ voice notes; other channels send MP3 audio. apiKey: "openai_api_key", model: "gpt-4o-mini-tts", voice: "alloy" + }, + edge: { + enabled: true, + voice: "en-US-MichelleNeural", + lang: "en-US" + }, + gemini: { + apiKey: "gemini_api_key", + model: "gemini-2.5-flash-preview-tts", + voiceName: "Kore", + baseUrl: "generativelanguage.googleapis.com" } } } @@ -1550,11 +1561,18 @@ Notes: - `summaryModel` overrides `agents.defaults.model.primary` for auto-summary. - Accepts `provider/model` or an alias from `agents.defaults.models`. - `modelOverrides` enables model-driven overrides like `[[tts:...]]` tags (on by default). +- `provider`: `"elevenlabs"`, `"openai"`, `"edge"`, or `"gemini"`. + - If `provider` is unset, Moltbot picks OpenAI when configured, then ElevenLabs, then Edge. + - Gemini falls back to ElevenLabs, then OpenAI, then Edge (if enabled). - `/tts limit` and `/tts summary` control per-user summarization settings. -- `apiKey` values fall back to `ELEVENLABS_API_KEY`/`XI_API_KEY` and `OPENAI_API_KEY`. +- `apiKey` values fall back to `ELEVENLABS_API_KEY`/`XI_API_KEY`, `OPENAI_API_KEY`, and `GEMINI_API_KEY`. - `elevenlabs.baseUrl` overrides the ElevenLabs API base URL. - `elevenlabs.voiceSettings` supports `stability`/`similarityBoost`/`style` (0..1), `useSpeakerBoost`, and `speed` (0.5..2.0). +- `edge.enabled` toggles Edge TTS (no API key required). 
+- `edge.outputFormat` sets the Edge TTS output format; defaults to MP3 if omitted. +- `gemini.baseUrl` adds `/v1beta` if missing. +- Gemini TTS requires `ffmpeg` to transcode PCM into MP3/Opus. ### `talk` diff --git a/docs/tts.md b/docs/tts.md index c3899ac08..3fb91301a 100644 --- a/docs/tts.md +++ b/docs/tts.md @@ -8,7 +8,7 @@ read_when: # Text-to-speech (TTS) -Moltbot can convert outbound replies into audio using ElevenLabs, OpenAI, or Edge TTS. +Moltbot can convert outbound replies into audio using ElevenLabs, OpenAI, Edge TTS, or Gemini. It works anywhere Moltbot can send audio; Telegram gets a round voice-note bubble. ## Supported services @@ -16,29 +16,32 @@ It works anywhere Moltbot can send audio; Telegram gets a round voice-note bubbl - **ElevenLabs** (primary or fallback provider) - **OpenAI** (primary or fallback provider; also used for summaries) - **Edge TTS** (primary or fallback provider; uses `node-edge-tts`, default when no API keys) +- **Gemini** (primary provider with fallback to ElevenLabs, then OpenAI) ### Edge TTS notes Edge TTS uses Microsoft Edge's online neural TTS service via the `node-edge-tts` library. It's a hosted service (not local), uses Microsoft’s endpoints, and does not require an API key. `node-edge-tts` exposes speech configuration options and -output formats, but not all options are supported by the Edge service. citeturn2search0 + output formats, but not all options are supported by the Edge service. Because Edge TTS is a public web service without a published SLA or quota, treat it as best-effort. If you need guaranteed limits and support, use OpenAI or ElevenLabs. Microsoft's Speech REST API documents a 10‑minute audio limit per request; Edge TTS -does not publish limits, so assume similar or lower limits. citeturn0search3 + does not publish limits, so assume similar or lower limits. 
## Optional keys -If you want OpenAI or ElevenLabs: +If you want OpenAI, ElevenLabs, or Gemini: - `ELEVENLABS_API_KEY` (or `XI_API_KEY`) - `OPENAI_API_KEY` +- `GEMINI_API_KEY` Edge TTS does **not** require an API key. If no API keys are found, Moltbot defaults to Edge TTS (unless disabled via `messages.tts.edge.enabled=false`). If multiple providers are configured, the selected provider is used first and the others are fallback options. +Gemini falls back to ElevenLabs, then OpenAI, then Edge (if enabled). Auto-summary uses the configured `summaryModel` (or `agents.defaults.model.primary`), so that provider must also be authenticated if you enable summaries. @@ -50,6 +53,9 @@ so that provider must also be authenticated if you enable summaries. - [ElevenLabs Authentication](https://elevenlabs.io/docs/api-reference/authentication) - [node-edge-tts](https://github.com/SchneeHertz/node-edge-tts) - [Microsoft Speech output formats](https://learn.microsoft.com/azure/ai-services/speech-service/rest-text-to-speech#audio-outputs) +- [Gemini speech generation](https://ai.google.dev/gemini-api/docs/speech-generation) +- [Gemini TTS voices](https://ai.google.dev/gemini-api/docs/speech-generation#voices) +- [Gemini TTS models](https://ai.google.dev/gemini-api/docs/speech-generation#supported-models) ## Is it enabled by default? @@ -136,6 +142,25 @@ Full schema is in [Gateway configuration](/gateway/configuration). } ``` +### Gemini primary + +```json5 +{ + messages: { + tts: { + enabled: true, + provider: "gemini", + gemini: { + apiKey: "gemini_api_key", + model: "gemini-2.5-flash-preview-tts", + voiceName: "Kore", + baseUrl: "generativelanguage.googleapis.com" + } + } + } +} +``` + ### Disable Edge TTS ```json5 @@ -202,17 +227,20 @@ Then run: - `tagged` only sends audio when the reply includes `[[tts]]` tags. - `enabled`: legacy toggle (doctor migrates this to `auto`). - `mode`: `"final"` (default) or `"all"` (includes tool/block replies). 
-- `provider`: `"elevenlabs"`, `"openai"`, or `"edge"` (fallback is automatic). -- If `provider` is **unset**, Moltbot prefers `openai` (if key), then `elevenlabs` (if key), - otherwise `edge`. +- `provider`: `"elevenlabs"`, `"openai"`, `"edge"`, or `"gemini"`. + - If `provider` is **unset**, Moltbot picks OpenAI when configured, then ElevenLabs, then Edge. + - Gemini falls back to ElevenLabs, then OpenAI, then Edge (if enabled). - `summaryModel`: optional cheap model for auto-summary; defaults to `agents.defaults.model.primary`. - Accepts `provider/model` or a configured model alias. - `modelOverrides`: allow the model to emit TTS directives (on by default). - `maxTextLength`: hard cap for TTS input (chars). `/tts audio` fails if exceeded. - `timeoutMs`: request timeout (ms). -- `prefsPath`: override the local prefs JSON path (provider/limit/summary). -- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `OPENAI_API_KEY`). +- `prefsPath`: override the local prefs JSON path. +- `apiKey` values fall back to env vars (`ELEVENLABS_API_KEY`/`XI_API_KEY`, `OPENAI_API_KEY`, `GEMINI_API_KEY`). - `elevenlabs.baseUrl`: override ElevenLabs API base URL. +- `gemini.baseUrl`: base URL for the Gemini API; if it doesn't end in `/v1beta`, Moltbot appends it. +- `gemini.voiceName`: prebuilt Gemini voice name (free-form; see Gemini docs for the list). +- `gemini.model`: Gemini TTS model name (free-form). - `elevenlabs.voiceSettings`: - `stability`, `similarityBoost`, `style`: `0..1` - `useSpeakerBoost`: `true|false` @@ -250,9 +278,9 @@ Here you go. 
``` Available directive keys (when enabled): -- `provider` (`openai` | `elevenlabs` | `edge`) -- `voice` (OpenAI voice) or `voiceId` (ElevenLabs) -- `model` (OpenAI TTS model or ElevenLabs model id) +- `provider` (`openai` | `elevenlabs` | `edge` | `gemini`) +- `voice` (OpenAI voice), `voiceId` (ElevenLabs), or `voiceName` (Gemini) +- `model` (OpenAI TTS model, ElevenLabs model id, or Gemini model name) - `stability`, `similarityBoost`, `style`, `speed`, `useSpeakerBoost` - `applyTextNormalization` (`auto|on|off`) - `languageCode` (ISO 639-1) @@ -304,18 +332,20 @@ These override `messages.tts.*` for that host. ## Output formats (fixed) -- **Telegram**: Opus voice note (`opus_48000_64` from ElevenLabs, `opus` from OpenAI). +- **Telegram**: Opus voice note (`opus_48000_64` from ElevenLabs, `opus` from OpenAI/Gemini). - 48kHz / 64kbps is a good voice-note tradeoff and required for the round bubble. -- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI). +- **Other channels**: MP3 (`mp3_44100_128` from ElevenLabs, `mp3` from OpenAI/Gemini). - 44.1kHz / 128kbps is the default balance for speech clarity. - **Edge TTS**: uses `edge.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`). - `node-edge-tts` accepts an `outputFormat`, but not all formats are available - from the Edge service. citeturn2search0 - - Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus). citeturn1search0 + from the Edge service. + - Output format values follow Microsoft Speech output formats (including Ogg/WebM Opus). - Telegram `sendVoice` accepts OGG/MP3/M4A; use OpenAI/ElevenLabs if you need - guaranteed Opus voice notes. citeturn1search1 + guaranteed Opus voice notes. - If the configured Edge output format fails, Moltbot retries with MP3. +Gemini returns raw PCM audio and requires `ffmpeg` to transcode into MP3/Opus. + OpenAI/ElevenLabs formats are fixed; Telegram expects Opus for voice-note UX. 
## Auto-TTS behavior @@ -359,7 +389,7 @@ Discord note: `/tts` is a built-in Discord command, so Moltbot registers /tts inbound /tts tagged /tts status -/tts provider openai +/tts provider gemini /tts limit 2000 /tts summary off /tts audio Hello from Moltbot diff --git a/src/auto-reply/reply/commands-tts.ts b/src/auto-reply/reply/commands-tts.ts index 04b60a4e9..21f7fb179 100644 --- a/src/auto-reply/reply/commands-tts.ts +++ b/src/auto-reply/reply/commands-tts.ts @@ -51,6 +51,7 @@ function ttsUsage(): ReplyPayload { `• edge — Free, fast (default)\n` + `• openai — High quality (requires API key)\n` + `• elevenlabs — Premium voices (requires API key)\n\n` + + `• gemini — High quality, offordable (requires API key)\n\n` + `**Text Limit (default: 1500, max: 4096):**\n` + `When text exceeds the limit:\n` + `• Summary ON: AI summarizes, then generates audio\n` + @@ -152,6 +153,7 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand const hasOpenAI = Boolean(resolveTtsApiKey(config, "openai")); const hasElevenLabs = Boolean(resolveTtsApiKey(config, "elevenlabs")); const hasEdge = isTtsProviderConfigured(config, "edge"); + const hasGemini = Boolean(resolveTtsApiKey(config, "gemini")); return { shouldContinue: false, reply: { @@ -161,13 +163,19 @@ export const handleTtsCommands: CommandHandler = async (params, allowTextCommand `OpenAI key: ${hasOpenAI ? "✅" : "❌"}\n` + `ElevenLabs key: ${hasElevenLabs ? "✅" : "❌"}\n` + `Edge enabled: ${hasEdge ? "✅" : "❌"}\n` + - `Usage: /tts provider openai | elevenlabs | edge`, + `Gemini key: ${hasGemini ? 
"✅" : "❌"}\n` + + `Usage: /tts provider openai | elevenlabs | edge | gemini`, }, }; } const requested = args.trim().toLowerCase(); - if (requested !== "openai" && requested !== "elevenlabs" && requested !== "edge") { + if ( + requested !== "openai" && + requested !== "elevenlabs" && + requested !== "edge" && + requested !== "gemini" + ) { return { shouldContinue: false, reply: ttsUsage() }; } diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index 4eb4989b9..7c2ea0cac 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -1,4 +1,4 @@ -export type TtsProvider = "elevenlabs" | "openai" | "edge"; +export type TtsProvider = "elevenlabs" | "openai" | "edge" | "gemini"; export type TtsMode = "final" | "all"; @@ -73,6 +73,13 @@ export type TtsConfig = { proxy?: string; timeoutMs?: number; }; + /** Gemini configuration. */ + gemini?: { + apiKey?: string; + model?: string; + voiceName?: string; + baseUrl?: string; + }; /** Optional path for local TTS user preferences JSON. */ prefsPath?: string; /** Hard cap for text sent to TTS (chars). 
*/ diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index 4a8c80bcc..aa6a4f745 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -156,7 +156,7 @@ export const MarkdownConfigSchema = z .strict() .optional(); -export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge"]); +export const TtsProviderSchema = z.enum(["elevenlabs", "openai", "edge", "gemini"]); export const TtsModeSchema = z.enum(["final", "all"]); export const TtsAutoSchema = z.enum(["off", "always", "inbound", "tagged"]); export const TtsConfigSchema = z @@ -224,6 +224,15 @@ export const TtsConfigSchema = z }) .strict() .optional(), + gemini: z + .object({ + apiKey: z.string().optional(), + model: z.string().optional(), + voiceName: z.string().optional(), + baseUrl: z.string().optional(), + }) + .strict() + .optional(), prefsPath: z.string().optional(), maxTextLength: z.number().int().min(1).optional(), timeoutMs: z.number().int().min(1000).max(120000).optional(), diff --git a/src/gateway/server-methods/tts.ts b/src/gateway/server-methods/tts.ts index 5e4e8254e..5e73b170a 100644 --- a/src/gateway/server-methods/tts.ts +++ b/src/gateway/server-methods/tts.ts @@ -1,5 +1,6 @@ import { loadConfig } from "../../config/config.js"; import { + GEMINI_TTS_MODELS, OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, getTtsProvider, @@ -38,6 +39,7 @@ export const ttsHandlers: GatewayRequestHandlers = { prefsPath, hasOpenAIKey: Boolean(resolveTtsApiKey(config, "openai")), hasElevenLabsKey: Boolean(resolveTtsApiKey(config, "elevenlabs")), + hasGeminiKey: Boolean(resolveTtsApiKey(config, "gemini")), edgeEnabled: isTtsProviderConfigured(config, "edge"), }); } catch (err) { @@ -100,13 +102,18 @@ export const ttsHandlers: GatewayRequestHandlers = { }, "tts.setProvider": async ({ params, respond }) => { const provider = typeof params.provider === "string" ? 
params.provider.trim() : ""; - if (provider !== "openai" && provider !== "elevenlabs" && provider !== "edge") { + if ( + provider !== "openai" && + provider !== "elevenlabs" && + provider !== "edge" && + provider !== "gemini" + ) { respond( false, undefined, errorShape( ErrorCodes.INVALID_REQUEST, - "Invalid provider. Use openai, elevenlabs, or edge.", + "Invalid provider. Use openai, elevenlabs, edge, or gemini.", ), ); return; @@ -147,6 +154,12 @@ export const ttsHandlers: GatewayRequestHandlers = { configured: isTtsProviderConfigured(config, "edge"), models: [], }, + { + id: "gemini", + name: "Gemini", + configured: Boolean(resolveTtsApiKey(config, "gemini")), + models: [...GEMINI_TTS_MODELS], + }, ], active: getTtsProvider(config, prefsPath), }); diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts index 8462cba01..7a0d4e287 100644 --- a/src/tts/tts.test.ts +++ b/src/tts/tts.test.ts @@ -45,11 +45,13 @@ const { isValidOpenAIModel, OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, + GEMINI_TTS_MODELS, parseTtsDirectives, resolveModelOverridePolicy, summarizeText, resolveOutputFormat, resolveEdgeOutputFormat, + normalizeGeminiBaseUrl, } = _test; describe("tts", () => { @@ -136,10 +138,18 @@ describe("tts", () => { }); }); + describe("GEMINI_TTS_MODELS", () => { + it("includes Gemini TTS preview models", () => { + expect(GEMINI_TTS_MODELS).toContain("gemini-2.5-flash-preview-tts"); + expect(GEMINI_TTS_MODELS).toContain("gemini-2.5-pro-preview-tts"); + }); + }); + describe("resolveOutputFormat", () => { it("uses Opus for Telegram", () => { const output = resolveOutputFormat("telegram"); expect(output.openai).toBe("opus"); + expect(output.gemini).toBe("opus"); expect(output.elevenlabs).toBe("opus_48000_64"); expect(output.extension).toBe(".opus"); expect(output.voiceCompatible).toBe(true); @@ -148,6 +158,7 @@ describe("tts", () => { it("uses MP3 for other channels", () => { const output = resolveOutputFormat("discord"); expect(output.openai).toBe("mp3"); + 
expect(output.gemini).toBe("mp3"); expect(output.elevenlabs).toBe("mp3_44100_128"); expect(output.extension).toBe(".mp3"); expect(output.voiceCompatible).toBe(false); @@ -178,6 +189,24 @@ describe("tts", () => { }); }); + describe("normalizeGeminiBaseUrl", () => { + it("adds scheme and v1beta", () => { + expect(normalizeGeminiBaseUrl("example.com")).toBe("https://example.com/v1beta"); + }); + + it("preserves v1beta when provided", () => { + expect(normalizeGeminiBaseUrl("https://example.com/v1beta")).toBe( + "https://example.com/v1beta", + ); + }); + + it("appends v1beta to custom paths", () => { + expect(normalizeGeminiBaseUrl("https://example.com/foo")).toBe( + "https://example.com/foo/v1beta", + ); + }); + }); + describe("parseTtsDirectives", () => { it("extracts overrides and strips directives when enabled", () => { const policy = resolveModelOverridePolicy({ enabled: true }); @@ -202,6 +231,17 @@ describe("tts", () => { expect(result.overrides.provider).toBe("edge"); }); + it("accepts Gemini provider overrides", () => { + const policy = resolveModelOverridePolicy({ enabled: true }); + const input = + "Hello [[tts:provider=gemini voiceName=Kore model=gemini-2.5-flash-preview-tts]] world"; + const result = parseTtsDirectives(input, policy); + + expect(result.overrides.provider).toBe("gemini"); + expect(result.overrides.gemini?.voiceName).toBe("Kore"); + expect(result.overrides.gemini?.model).toBe("gemini-2.5-flash-preview-tts"); + }); + it("keeps text intact when overrides are disabled", () => { const policy = resolveModelOverridePolicy({ enabled: false }); const input = "Hello [[tts:voice=alloy]] world"; diff --git a/src/tts/tts.ts b/src/tts/tts.ts index faa83d3a6..4c8ec8ea9 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -8,6 +8,7 @@ import { renameSync, unlinkSync, } from "node:fs"; +import { spawn } from "node:child_process"; import { tmpdir } from "node:os"; import path from "node:path"; @@ -51,6 +52,9 @@ const DEFAULT_OPENAI_VOICE = "alloy"; const 
DEFAULT_EDGE_VOICE = "en-US-MichelleNeural"; const DEFAULT_EDGE_LANG = "en-US"; const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3"; +const DEFAULT_GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"; +const DEFAULT_GEMINI_MODEL = "gemini-2.5-flash-preview-tts"; +const DEFAULT_GEMINI_VOICE = "Kore"; const DEFAULT_ELEVENLABS_VOICE_SETTINGS = { stability: 0.5, @@ -62,6 +66,7 @@ const DEFAULT_ELEVENLABS_VOICE_SETTINGS = { const TELEGRAM_OUTPUT = { openai: "opus" as const, + gemini: "opus" as const, // ElevenLabs output formats use codec_sample_rate_bitrate naming. // Opus @ 48kHz/64kbps is a good voice-note tradeoff for Telegram. elevenlabs: "opus_48000_64", @@ -71,6 +76,7 @@ const TELEGRAM_OUTPUT = { const DEFAULT_OUTPUT = { openai: "mp3" as const, + gemini: "mp3" as const, elevenlabs: "mp3_44100_128", extension: ".mp3", voiceCompatible: false, @@ -124,6 +130,12 @@ export type ResolvedTtsConfig = { proxy?: string; timeoutMs?: number; }; + gemini: { + apiKey?: string; + model: string; + voiceName: string; + baseUrl: string; + }; prefsPath?: string; maxTextLength: number; timeoutMs: number; @@ -157,6 +169,10 @@ type TtsDirectiveOverrides = { voice?: string; model?: string; }; + gemini?: { + voiceName?: string; + model?: string; + }; elevenlabs?: { voiceId?: string; modelId?: string; @@ -296,6 +312,12 @@ export function resolveTtsConfig(cfg: MoltbotConfig): ResolvedTtsConfig { proxy: raw.edge?.proxy?.trim() || undefined, timeoutMs: raw.edge?.timeoutMs, }, + gemini: { + apiKey: raw.gemini?.apiKey, + model: normalizeGeminiModel(raw.gemini?.model), + voiceName: normalizeGeminiVoiceName(raw.gemini?.voiceName), + baseUrl: normalizeGeminiBaseUrl(raw.gemini?.baseUrl), + }, prefsPath: raw.prefsPath, maxTextLength: raw.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH, timeoutMs: raw.timeoutMs ?? 
DEFAULT_TIMEOUT_MS, @@ -474,12 +496,18 @@ export function resolveTtsApiKey( if (provider === "openai") { return config.openai.apiKey || process.env.OPENAI_API_KEY; } + if (provider === "gemini") { + return config.gemini.apiKey || process.env.GEMINI_API_KEY; + } return undefined; } -export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge"] as const; +export const TTS_PROVIDERS = ["openai", "elevenlabs", "edge", "gemini"] as const; export function resolveTtsProviderOrder(primary: TtsProvider): TtsProvider[] { + if (primary === "gemini") { + return ["gemini", "elevenlabs", "openai", "edge"]; + } return [primary, ...TTS_PROVIDERS.filter((provider) => provider !== primary)]; } @@ -498,6 +526,45 @@ function normalizeElevenLabsBaseUrl(baseUrl: string): string { return trimmed.replace(/\/+$/, ""); } +function isSafeGeminiValue(value: string): boolean { + return /^[a-zA-Z0-9._-]+$/.test(value); +} + +function normalizeGeminiBaseUrl(baseUrl?: string): string { + const trimmed = baseUrl?.trim(); + if (!trimmed) return DEFAULT_GEMINI_BASE_URL; + const withScheme = /^https?:\/\//i.test(trimmed) ? 
trimmed : `https://${trimmed}`; + const url = new URL(withScheme); + let pathname = url.pathname.replace(/\/+$/, ""); + if (!pathname || pathname === "/") { + pathname = "/v1beta"; + } else if (!pathname.endsWith("/v1beta")) { + pathname = `${pathname}/v1beta`; + } + url.pathname = pathname; + url.search = ""; + url.hash = ""; + return url.toString().replace(/\/+$/, ""); +} + +function normalizeGeminiModel(model?: string): string { + const trimmed = model?.trim(); + if (!trimmed) return DEFAULT_GEMINI_MODEL; + if (!isSafeGeminiValue(trimmed)) { + throw new Error("Gemini model contains invalid characters"); + } + return trimmed; +} + +function normalizeGeminiVoiceName(voiceName?: string): string { + const trimmed = voiceName?.trim(); + if (!trimmed) return DEFAULT_GEMINI_VOICE; + if (!isSafeGeminiValue(trimmed)) { + throw new Error("Gemini voiceName contains invalid characters"); + } + return trimmed; +} + function requireInRange(value: number, min: number, max: number, label: string): void { if (!Number.isFinite(value) || value < min || value > max) { throw new Error(`${label} must be between ${min} and ${max}`); @@ -576,6 +643,24 @@ function parseTtsDirectives( cleanedText = cleanedText.replace(directiveRegex, (_match, body: string) => { hasDirective = true; const tokens = body.split(/\s+/).filter(Boolean); + const providerOverrideFromTokens = (() => { + for (const token of tokens) { + const eqIndex = token.indexOf("="); + if (eqIndex === -1) continue; + const rawKey = token.slice(0, eqIndex).trim().toLowerCase(); + if (rawKey !== "provider") continue; + const rawValue = token.slice(eqIndex + 1).trim(); + if ( + rawValue === "openai" || + rawValue === "elevenlabs" || + rawValue === "edge" || + rawValue === "gemini" + ) { + return rawValue; + } + } + return undefined; + })(); for (const token of tokens) { const eqIndex = token.indexOf("="); if (eqIndex === -1) continue; @@ -587,7 +672,12 @@ function parseTtsDirectives( switch (key) { case "provider": if 
(!policy.allowProvider) break; - if (rawValue === "openai" || rawValue === "elevenlabs" || rawValue === "edge") { + if ( + rawValue === "openai" || + rawValue === "elevenlabs" || + rawValue === "edge" || + rawValue === "gemini" + ) { overrides.provider = rawValue; } else { warnings.push(`unsupported provider "${rawValue}"`); @@ -624,10 +714,38 @@ function parseTtsDirectives( if (!policy.allowModelId) break; if (isValidOpenAIModel(rawValue)) { overrides.openai = { ...overrides.openai, model: rawValue }; + } else if (providerOverrideFromTokens === "gemini") { + if (isSafeGeminiValue(rawValue)) { + overrides.gemini = { ...overrides.gemini, model: rawValue }; + } else { + warnings.push(`invalid Gemini model "${rawValue}"`); + } + } else if (isGeminiModelToken(rawValue)) { + overrides.gemini = { ...overrides.gemini, model: rawValue }; } else { overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue }; } break; + case "gemini_model": + case "geminimodel": + if (!policy.allowModelId) break; + if (isSafeGeminiValue(rawValue)) { + overrides.gemini = { ...overrides.gemini, model: rawValue }; + } else { + warnings.push(`invalid Gemini model "${rawValue}"`); + } + break; + case "voicename": + case "voice_name": + case "gemini_voice": + case "geminivoice": + if (!policy.allowVoice) break; + if (isSafeGeminiValue(rawValue)) { + overrides.gemini = { ...overrides.gemini, voiceName: rawValue }; + } else { + warnings.push(`invalid Gemini voiceName "${rawValue}"`); + } + break; case "stability": if (!policy.allowVoiceSettings) break; { @@ -782,6 +900,11 @@ export const OPENAI_TTS_VOICES = [ "shimmer", ] as const; +export const GEMINI_TTS_MODELS = [ + "gemini-2.5-flash-preview-tts", + "gemini-2.5-pro-preview-tts", +] as const; + type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number]; function isValidOpenAIModel(model: string): boolean { @@ -790,6 +913,10 @@ function isValidOpenAIModel(model: string): boolean { return OPENAI_TTS_MODELS.includes(model as (typeof 
OPENAI_TTS_MODELS)[number]); } +function isGeminiModelToken(model: string): boolean { + return model.startsWith("gemini-") && isSafeGeminiValue(model); +} + function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice { // Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices) if (isCustomOpenAIEndpoint()) return true; @@ -1076,6 +1203,143 @@ async function edgeTTS(params: { await tts.ttsPromise(text, outputPath); } +async function geminiTTS(params: { + text: string; + apiKey: string; + model: string; + voiceName: string; + baseUrl: string; + timeoutMs: number; +}): Promise { + const { text, apiKey, model, voiceName, baseUrl, timeoutMs } = params; + + if (!isSafeGeminiValue(model)) { + throw new Error(`Gemini model contains invalid characters: ${model}`); + } + if (!isSafeGeminiValue(voiceName)) { + throw new Error(`Gemini voiceName contains invalid characters: ${voiceName}`); + } + + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), timeoutMs); + + try { + const url = new URL( + `${normalizeGeminiBaseUrl(baseUrl)}/models/${encodeURIComponent(model)}:generateContent`, + ); + const response = await fetch(url.toString(), { + method: "POST", + headers: { + "x-goog-api-key": apiKey, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model, + contents: [{ parts: [{ text }] }], + generationConfig: { + responseModalities: ["AUDIO"], + speechConfig: { + voiceConfig: { + prebuiltVoiceConfig: { voiceName }, + }, + }, + }, + }), + signal: controller.signal, + }); + + if (!response.ok) { + throw new Error(`Gemini TTS API error (${response.status})`); + } + + const payload = (await response.json()) as { + candidates?: Array<{ + content?: { + parts?: Array<{ inlineData?: { data?: string } }>; + }; + }>; + }; + const data = payload?.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data?.trim(); + if (!data) { + throw new Error("Gemini TTS response missing audio data"); + } + return 
Buffer.from(data, "base64"); + } finally { + clearTimeout(timeout); + } +} + +async function transcodePcmWithFfmpeg(params: { + pcm: Buffer; + format: "mp3" | "opus"; + timeoutMs: number; +}): Promise { + const { pcm, format, timeoutMs } = params; + const args = [ + "-hide_banner", + "-loglevel", + "error", + "-f", + "s16le", + "-ar", + "24000", + "-ac", + "1", + "-i", + "pipe:0", + ]; + + if (format === "opus") { + args.push("-f", "opus", "-c:a", "libopus", "-b:a", "64k", "-ar", "48000", "-ac", "1"); + } else { + args.push("-f", "mp3", "-b:a", "128k", "-ar", "44100", "-ac", "1"); + } + + args.push("pipe:1"); + + return await new Promise((resolve, reject) => { + const child = spawn("ffmpeg", args, { + stdio: ["pipe", "pipe", "pipe"], + }); + + const stdoutChunks: Buffer[] = []; + let stderr = ""; + let settled = false; + const timer = setTimeout(() => { + child.kill("SIGKILL"); + }, timeoutMs); + + child.on("error", (err) => { + if (settled) return; + settled = true; + clearTimeout(timer); + reject(err); + }); + + child.stdout?.on("data", (chunk) => { + stdoutChunks.push(Buffer.from(chunk)); + }); + + child.stderr?.on("data", (chunk) => { + stderr += chunk.toString(); + }); + + child.on("close", (code) => { + if (settled) return; + settled = true; + clearTimeout(timer); + if (code !== 0) { + reject(new Error(stderr.trim() || `ffmpeg exited with code ${code ?? "unknown"}`)); + return; + } + resolve(Buffer.concat(stdoutChunks)); + }); + + child.stdin?.write(pcm); + child.stdin?.end(); + }); +} + export async function textToSpeech(params: { text: string; cfg: MoltbotConfig; @@ -1180,7 +1444,31 @@ export async function textToSpeech(params: { } let audioBuffer: Buffer; - if (provider === "elevenlabs") { + if (provider === "gemini") { + const geminiModelOverride = params.overrides?.gemini?.model; + const geminiVoiceOverride = params.overrides?.gemini?.voiceName; + const pcmBuffer = await geminiTTS({ + text: params.text, + apiKey, + model: geminiModelOverride ?? 
config.gemini.model, + voiceName: geminiVoiceOverride ?? config.gemini.voiceName, + baseUrl: config.gemini.baseUrl, + timeoutMs: config.timeoutMs, + }); + try { + audioBuffer = await transcodePcmWithFfmpeg({ + pcm: pcmBuffer, + format: output.gemini, + timeoutMs: config.timeoutMs, + }); + } catch (err) { + const error = err as NodeJS.ErrnoException; + if (error.code === "ENOENT") { + throw new Error("ffmpeg not found; Gemini TTS requires ffmpeg"); + } + throw err; + } + } else if (provider === "elevenlabs") { const voiceIdOverride = params.overrides?.elevenlabs?.voiceId; const modelIdOverride = params.overrides?.elevenlabs?.modelId; const voiceSettings = { @@ -1228,7 +1516,12 @@ export async function textToSpeech(params: { audioPath, latencyMs, provider, - outputFormat: provider === "openai" ? output.openai : output.elevenlabs, + outputFormat: + provider === "openai" + ? output.openai + : provider === "gemini" + ? output.gemini + : output.elevenlabs, voiceCompatible: output.voiceCompatible, }; } catch (err) { @@ -1473,9 +1766,11 @@ export const _test = { isValidOpenAIModel, OPENAI_TTS_MODELS, OPENAI_TTS_VOICES, + GEMINI_TTS_MODELS, parseTtsDirectives, resolveModelOverridePolicy, summarizeText, resolveOutputFormat, resolveEdgeOutputFormat, + normalizeGeminiBaseUrl, };