diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index 1d4e95cb0..2fd90cf30 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -1540,6 +1540,7 @@ voice notes; other channels send MP3 audio. Notes: - `messages.tts.enabled` can be overridden by local user prefs (see `/tts on`, `/tts off`). +- `onlyWhenInboundAudio` limits auto-TTS to replies where the last inbound message includes audio/voice. - `prefsPath` stores local overrides (enabled/provider/limit/summarize). - `maxTextLength` is a hard cap for TTS input; summaries are truncated to fit. - `summaryModel` overrides `agents.defaults.model.primary` for auto-summary. diff --git a/docs/tts.md b/docs/tts.md index 61da1f0dc..dc6576154 100644 --- a/docs/tts.md +++ b/docs/tts.md @@ -165,6 +165,19 @@ Full schema is in [Gateway configuration](/gateway/configuration). } ``` +### Only reply with audio after an inbound voice note + +```json5 +{ + messages: { + tts: { + enabled: true, + onlyWhenInboundAudio: true + } + } +} +``` + ### Disable auto-summary for long replies ```json5 @@ -186,6 +199,7 @@ Then run: ### Notes on fields - `enabled`: master toggle (default `false`; local prefs can override). +- `onlyWhenInboundAudio`: only auto-send TTS when the last inbound message includes audio/voice. - `mode`: `"final"` (default) or `"all"` (includes tool/block replies). - `provider`: `"elevenlabs"`, `"openai"`, or `"edge"` (fallback is automatic). 
- If `provider` is **unset**, Clawdbot prefers `openai` (if key), then `elevenlabs` (if key), diff --git a/src/auto-reply/reply/dispatch-from-config.ts b/src/auto-reply/reply/dispatch-from-config.ts index 5885d729e..d474ccb93 100644 --- a/src/auto-reply/reply/dispatch-from-config.ts +++ b/src/auto-reply/reply/dispatch-from-config.ts @@ -21,6 +21,50 @@ export type DispatchFromConfigResult = { counts: Record; }; +/** + * Matches a body that is nothing but the audio media placeholder, optionally + * followed by a parenthesized suffix. + * NOTE(review): assumes inbound adapters emit "<media:audio>" as the + * placeholder text; confirm against the channel integrations. + */ +const AUDIO_PLACEHOLDER_RE = /^<media:audio>(\s*\([^)]*\))?$/i; +/** Matches a body that starts with an "[Audio]" header, ignoring case. */ +const AUDIO_HEADER_RE = /^\[Audio\]/i; + +/** Drops MIME parameters (everything from the first ";"), trims, and lower-cases the type. */ +const normalizeMediaType = (value: string): string => + (value.split(";")[0] ?? "").trim().toLowerCase(); + +/** + * Reports whether the inbound message looks like audio. Any MediaType or + * MediaTypes entry equal to "audio" or starting with "audio/" is sufficient; + * otherwise the first string among BodyForCommands, CommandBody, RawBody and + * Body is trimmed and tested against the placeholder and "[Audio]" patterns. + */ +const isInboundAudioContext = (ctx: FinalizedMsgContext): boolean => { + const rawTypes = [ + typeof ctx.MediaType === "string" ? ctx.MediaType : undefined, + ...(Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : []), + ].filter((type): type is string => typeof type === "string" && type.length > 0); + const types = rawTypes.map((type) => normalizeMediaType(type)); + if (types.some((type) => type === "audio" || type.startsWith("audio/"))) return true; + + const body = + typeof ctx.BodyForCommands === "string" + ? ctx.BodyForCommands + : typeof ctx.CommandBody === "string" + ? ctx.CommandBody + : typeof ctx.RawBody === "string" + ? ctx.RawBody + : typeof ctx.Body === "string" + ? 
ctx.Body + : ""; + const trimmed = body.trim(); + if (!trimmed) return false; + if (AUDIO_PLACEHOLDER_RE.test(trimmed)) return true; + return AUDIO_HEADER_RE.test(trimmed); +}; + export async function dispatchReplyFromConfig(params: { ctx: FinalizedMsgContext; cfg: ClawdbotConfig; @@ -81,6 +125,8 @@ export async function dispatchReplyFromConfig(params: { return { queuedFinal: false, counts: dispatcher.getQueuedCounts() }; } + const inboundAudio = isInboundAudioContext(ctx); + const hookRunner = getGlobalHookRunner(); if (hookRunner?.hasHooks("message_received")) { const timestamp = @@ -223,6 +269,7 @@ export async function dispatchReplyFromConfig(params: { cfg, channel: ttsChannel, kind: "tool", + inboundAudio, }); if (shouldRouteToOriginating) { await sendPayloadAsync(ttsPayload); @@ -239,6 +286,7 @@ export async function dispatchReplyFromConfig(params: { cfg, channel: ttsChannel, kind: "block", + inboundAudio, }); if (shouldRouteToOriginating) { await sendPayloadAsync(ttsPayload, context?.abortSignal); @@ -262,6 +310,7 @@ export async function dispatchReplyFromConfig(params: { cfg, channel: ttsChannel, kind: "final", + inboundAudio, }); if (shouldRouteToOriginating && originatingChannel && originatingTo) { // Route final reply to originating channel. diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index 28b65c96d..fe340d468 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -24,6 +24,8 @@ export type TtsModelOverrideConfig = { export type TtsConfig = { /** Enable auto-TTS (can be overridden by local prefs). */ enabled?: boolean; + /** Only run auto-TTS when the last inbound message includes audio. */ + onlyWhenInboundAudio?: boolean; /** Apply TTS to final replies only or to all replies (tool/block/final). */ mode?: TtsMode; /** Primary TTS provider (fallbacks are automatic). 
*/ diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index bcf769b67..b8c1d8f4d 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -161,6 +161,7 @@ export const TtsModeSchema = z.enum(["final", "all"]); export const TtsConfigSchema = z .object({ enabled: z.boolean().optional(), + onlyWhenInboundAudio: z.boolean().optional(), mode: TtsModeSchema.optional(), provider: TtsProviderSchema.optional(), summaryModel: z.string().optional(), diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts index fafe3bbdf..73c38a9e4 100644 --- a/src/tts/tts.test.ts +++ b/src/tts/tts.test.ts @@ -4,7 +4,7 @@ import { completeSimple } from "@mariozechner/pi-ai"; import { getApiKeyForModel } from "../agents/model-auth.js"; import { resolveModel } from "../agents/pi-embedded-runner/model.js"; -import { _test, getTtsProvider, resolveTtsConfig } from "./tts.js"; +import * as tts from "./tts.js"; vi.mock("@mariozechner/pi-ai", () => ({ completeSimple: vi.fn(), @@ -37,6 +37,8 @@ vi.mock("../agents/model-auth.js", () => ({ requireApiKey: vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? 
""), })); +const { _test, resolveTtsConfig, maybeApplyTtsToPayload, getTtsProvider } = tts; + const { isValidVoiceId, isValidOpenAIVoice, @@ -431,4 +433,69 @@ describe("tts", () => { ); }); }); + + describe("maybeApplyTtsToPayload", () => { + const baseCfg = { + agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } }, + messages: { tts: { enabled: true, onlyWhenInboundAudio: true } }, + }; + + /** + * Puts CLAWDBOT_TTS_PREFS back to its pre-test state. Assigning `undefined` + * to a process.env key stores the string "undefined", so a variable that was + * originally unset has to be deleted instead. + */ + const restorePrefsEnv = (prev: string | undefined): void => { + if (prev === undefined) delete process.env.CLAWDBOT_TTS_PREFS; + else process.env.CLAWDBOT_TTS_PREFS = prev; + }; + + it("skips auto-TTS when inbound audio gating is on and the message is not audio", async () => { + const prevPrefs = process.env.CLAWDBOT_TTS_PREFS; + process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`; + const spy = vi.spyOn(tts, "textToSpeech").mockResolvedValue({ + success: false, + error: "nope", + }); + + try { + const payload = { text: "Hello world" }; + const result = await maybeApplyTtsToPayload({ + payload, + cfg: baseCfg, + kind: "final", + inboundAudio: false, + }); + + expect(result).toBe(payload); + expect(spy).not.toHaveBeenCalled(); + } finally { + spy.mockRestore(); + restorePrefsEnv(prevPrefs); + } + }); + + it("attempts auto-TTS when inbound audio gating is on and the message is audio", async () => { + const prevPrefs = process.env.CLAWDBOT_TTS_PREFS; + process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`; + const spy = vi.spyOn(tts, "textToSpeech").mockResolvedValue({ + success: false, + error: "nope", + }); + + try { + await maybeApplyTtsToPayload({ + payload: { text: "Hello world" }, + cfg: baseCfg, + kind: "final", + inboundAudio: true, + }); + + expect(spy).toHaveBeenCalledTimes(1); + } finally { + spy.mockRestore(); + restorePrefsEnv(prevPrefs); + } + }); + }); }); diff --git a/src/tts/tts.ts b/src/tts/tts.ts index cf2823f95..2a7568cbf 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -77,6 +77,7 @@ const DEFAULT_OUTPUT = { export type ResolvedTtsConfig = { enabled: boolean; + onlyWhenInboundAudio: boolean; mode: TtsMode; provider: TtsProvider; providerSource: "config" | "default"; @@ -222,6 +223,7 @@ export function 
resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig { const edgeOutputFormat = raw.edge?.outputFormat?.trim(); return { enabled: raw.enabled ?? false, + onlyWhenInboundAudio: raw.onlyWhenInboundAudio ?? false, mode: raw.mode ?? "final", provider: raw.provider ?? "edge", providerSource, @@ -285,11 +287,17 @@ export function buildTtsSystemPromptHint(cfg: ClawdbotConfig): string | undefine if (!isTtsEnabled(config, prefsPath)) return undefined; const maxLength = getTtsMaxLength(prefsPath); const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off"; + const inboundAudioHint = config.onlyWhenInboundAudio + ? "Only use TTS when the user's last message includes audio/voice." + : undefined; return [ "Voice (TTS) is enabled.", + inboundAudioHint, `Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summarize}).`, "Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.", - ].join("\n"); + ] + .filter(Boolean) + .join("\n"); } function readPrefs(prefsPath: string): TtsUserPrefs { @@ -1156,10 +1164,12 @@ export async function maybeApplyTtsToPayload(params: { cfg: ClawdbotConfig; channel?: string; kind?: "tool" | "block" | "final"; + inboundAudio?: boolean; }): Promise { const config = resolveTtsConfig(params.cfg); const prefsPath = resolveTtsPrefsPath(config); if (!isTtsEnabled(config, prefsPath)) return params.payload; + if (config.onlyWhenInboundAudio && params.inboundAudio !== true) return params.payload; const mode = config.mode ?? "final"; if (mode === "final" && params.kind && params.kind !== "final") return params.payload;