TTS: gate auto audio on inbound audio
This commit is contained in:
parent
cdceff2284
commit
d25fc7aa2b
@ -1540,6 +1540,7 @@ voice notes; other channels send MP3 audio.
|
|||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
- `messages.tts.enabled` can be overridden by local user prefs (see `/tts on`, `/tts off`).
|
- `messages.tts.enabled` can be overridden by local user prefs (see `/tts on`, `/tts off`).
|
||||||
|
- `onlyWhenInboundAudio` limits auto-TTS to replies where the last inbound message includes audio/voice.
|
||||||
- `prefsPath` stores local overrides (enabled/provider/limit/summarize).
|
- `prefsPath` stores local overrides (enabled/provider/limit/summarize).
|
||||||
- `maxTextLength` is a hard cap for TTS input; summaries are truncated to fit.
|
- `maxTextLength` is a hard cap for TTS input; summaries are truncated to fit.
|
||||||
- `summaryModel` overrides `agents.defaults.model.primary` for auto-summary.
|
- `summaryModel` overrides `agents.defaults.model.primary` for auto-summary.
|
||||||
|
|||||||
14
docs/tts.md
14
docs/tts.md
@ -165,6 +165,19 @@ Full schema is in [Gateway configuration](/gateway/configuration).
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Only reply with audio after an inbound voice note
|
||||||
|
|
||||||
|
```json5
|
||||||
|
{
|
||||||
|
messages: {
|
||||||
|
tts: {
|
||||||
|
enabled: true,
|
||||||
|
onlyWhenInboundAudio: true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
### Disable auto-summary for long replies
|
### Disable auto-summary for long replies
|
||||||
|
|
||||||
```json5
|
```json5
|
||||||
@ -186,6 +199,7 @@ Then run:
|
|||||||
### Notes on fields
|
### Notes on fields
|
||||||
|
|
||||||
- `enabled`: master toggle (default `false`; local prefs can override).
|
- `enabled`: master toggle (default `false`; local prefs can override).
|
||||||
|
- `onlyWhenInboundAudio`: only auto-send TTS when the last inbound message includes audio/voice.
|
||||||
- `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
|
- `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
|
||||||
- `provider`: `"elevenlabs"`, `"openai"`, or `"edge"` (fallback is automatic).
|
- `provider`: `"elevenlabs"`, `"openai"`, or `"edge"` (fallback is automatic).
|
||||||
- If `provider` is **unset**, Clawdbot prefers `openai` (if key), then `elevenlabs` (if key),
|
- If `provider` is **unset**, Clawdbot prefers `openai` (if key), then `elevenlabs` (if key),
|
||||||
|
|||||||
@ -21,6 +21,35 @@ export type DispatchFromConfigResult = {
|
|||||||
counts: Record<ReplyDispatchKind, number>;
|
counts: Record<ReplyDispatchKind, number>;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/** Matches a body that is only `<media:audio>`, optionally followed by a parenthesized suffix. */
const AUDIO_PLACEHOLDER_RE = /^<media:audio>(\s*\([^)]*\))?$/i;
/** Matches a body that begins with an `[Audio]` header (case-insensitive). */
const AUDIO_HEADER_RE = /^\[Audio\]/i;

/**
 * Reduces a media type to its bare, lower-cased form by dropping everything
 * from the first `;` onward (e.g. `"Audio/OGG; codecs=opus"` -> `"audio/ogg"`).
 */
const normalizeMediaType = (value: string): string =>
  // `split` always yields at least one element; `?? ""` keeps the declared
  // `string` return type honest under `noUncheckedIndexedAccess`.
  (value.split(";")[0] ?? "").trim().toLowerCase();

/**
 * Reports whether the inbound message described by `ctx` carries audio.
 *
 * Detection order:
 * 1. `ctx.MediaType` / `ctx.MediaTypes`: true when any string entry normalizes
 *    to `audio` or `audio/*`.
 * 2. Otherwise the first string-valued field among `BodyForCommands`,
 *    `CommandBody`, `RawBody`, `Body` (in that order, even if it is empty) is
 *    trimmed and tested against the audio placeholder and `[Audio]` header.
 *
 * @param ctx - Finalized inbound message context.
 * @returns `true` when the message looks like audio/voice, otherwise `false`.
 */
const isInboundAudioContext = (ctx: FinalizedMsgContext): boolean => {
  const candidates: unknown[] = [
    ctx.MediaType,
    ...(Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : []),
  ];
  // Narrow each entry at runtime instead of asserting `as string[]`: a
  // non-string entry must be ignored, not passed to `normalizeMediaType`
  // where `.split` would throw.
  const hasAudioType = candidates.some((candidate) => {
    if (typeof candidate !== "string") return false;
    const type = normalizeMediaType(candidate);
    return type === "audio" || type.startsWith("audio/");
  });
  if (hasAudioType) return true;

  // First string-valued field wins, including an empty string (no fall-through).
  const body =
    [ctx.BodyForCommands, ctx.CommandBody, ctx.RawBody, ctx.Body].find(
      (value): value is string => typeof value === "string",
    ) ?? "";
  const trimmed = body.trim();
  if (!trimmed) return false;
  return AUDIO_PLACEHOLDER_RE.test(trimmed) || AUDIO_HEADER_RE.test(trimmed);
};
|
||||||
|
|
||||||
export async function dispatchReplyFromConfig(params: {
|
export async function dispatchReplyFromConfig(params: {
|
||||||
ctx: FinalizedMsgContext;
|
ctx: FinalizedMsgContext;
|
||||||
cfg: ClawdbotConfig;
|
cfg: ClawdbotConfig;
|
||||||
@ -81,6 +110,8 @@ export async function dispatchReplyFromConfig(params: {
|
|||||||
return { queuedFinal: false, counts: dispatcher.getQueuedCounts() };
|
return { queuedFinal: false, counts: dispatcher.getQueuedCounts() };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const inboundAudio = isInboundAudioContext(ctx);
|
||||||
|
|
||||||
const hookRunner = getGlobalHookRunner();
|
const hookRunner = getGlobalHookRunner();
|
||||||
if (hookRunner?.hasHooks("message_received")) {
|
if (hookRunner?.hasHooks("message_received")) {
|
||||||
const timestamp =
|
const timestamp =
|
||||||
@ -223,6 +254,7 @@ export async function dispatchReplyFromConfig(params: {
|
|||||||
cfg,
|
cfg,
|
||||||
channel: ttsChannel,
|
channel: ttsChannel,
|
||||||
kind: "tool",
|
kind: "tool",
|
||||||
|
inboundAudio,
|
||||||
});
|
});
|
||||||
if (shouldRouteToOriginating) {
|
if (shouldRouteToOriginating) {
|
||||||
await sendPayloadAsync(ttsPayload);
|
await sendPayloadAsync(ttsPayload);
|
||||||
@ -239,6 +271,7 @@ export async function dispatchReplyFromConfig(params: {
|
|||||||
cfg,
|
cfg,
|
||||||
channel: ttsChannel,
|
channel: ttsChannel,
|
||||||
kind: "block",
|
kind: "block",
|
||||||
|
inboundAudio,
|
||||||
});
|
});
|
||||||
if (shouldRouteToOriginating) {
|
if (shouldRouteToOriginating) {
|
||||||
await sendPayloadAsync(ttsPayload, context?.abortSignal);
|
await sendPayloadAsync(ttsPayload, context?.abortSignal);
|
||||||
@ -262,6 +295,7 @@ export async function dispatchReplyFromConfig(params: {
|
|||||||
cfg,
|
cfg,
|
||||||
channel: ttsChannel,
|
channel: ttsChannel,
|
||||||
kind: "final",
|
kind: "final",
|
||||||
|
inboundAudio,
|
||||||
});
|
});
|
||||||
if (shouldRouteToOriginating && originatingChannel && originatingTo) {
|
if (shouldRouteToOriginating && originatingChannel && originatingTo) {
|
||||||
// Route final reply to originating channel.
|
// Route final reply to originating channel.
|
||||||
|
|||||||
@ -24,6 +24,8 @@ export type TtsModelOverrideConfig = {
|
|||||||
export type TtsConfig = {
|
export type TtsConfig = {
|
||||||
/** Enable auto-TTS (can be overridden by local prefs). */
|
/** Enable auto-TTS (can be overridden by local prefs). */
|
||||||
enabled?: boolean;
|
enabled?: boolean;
|
||||||
|
/** Only run auto-TTS when the last inbound message includes audio. */
|
||||||
|
onlyWhenInboundAudio?: boolean;
|
||||||
/** Apply TTS to final replies only or to all replies (tool/block/final). */
|
/** Apply TTS to final replies only or to all replies (tool/block/final). */
|
||||||
mode?: TtsMode;
|
mode?: TtsMode;
|
||||||
/** Primary TTS provider (fallbacks are automatic). */
|
/** Primary TTS provider (fallbacks are automatic). */
|
||||||
|
|||||||
@ -161,6 +161,7 @@ export const TtsModeSchema = z.enum(["final", "all"]);
|
|||||||
export const TtsConfigSchema = z
|
export const TtsConfigSchema = z
|
||||||
.object({
|
.object({
|
||||||
enabled: z.boolean().optional(),
|
enabled: z.boolean().optional(),
|
||||||
|
onlyWhenInboundAudio: z.boolean().optional(),
|
||||||
mode: TtsModeSchema.optional(),
|
mode: TtsModeSchema.optional(),
|
||||||
provider: TtsProviderSchema.optional(),
|
provider: TtsProviderSchema.optional(),
|
||||||
summaryModel: z.string().optional(),
|
summaryModel: z.string().optional(),
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import { completeSimple } from "@mariozechner/pi-ai";
|
|||||||
|
|
||||||
import { getApiKeyForModel } from "../agents/model-auth.js";
|
import { getApiKeyForModel } from "../agents/model-auth.js";
|
||||||
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
|
import { resolveModel } from "../agents/pi-embedded-runner/model.js";
|
||||||
import { _test, getTtsProvider, resolveTtsConfig } from "./tts.js";
|
import * as tts from "./tts.js";
|
||||||
|
|
||||||
vi.mock("@mariozechner/pi-ai", () => ({
|
vi.mock("@mariozechner/pi-ai", () => ({
|
||||||
completeSimple: vi.fn(),
|
completeSimple: vi.fn(),
|
||||||
@ -37,6 +37,8 @@ vi.mock("../agents/model-auth.js", () => ({
|
|||||||
requireApiKey: vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? ""),
|
requireApiKey: vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? ""),
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
const { _test, resolveTtsConfig, maybeApplyTtsToPayload, getTtsProvider } = tts;
|
||||||
|
|
||||||
const {
|
const {
|
||||||
isValidVoiceId,
|
isValidVoiceId,
|
||||||
isValidOpenAIVoice,
|
isValidOpenAIVoice,
|
||||||
@ -431,4 +433,55 @@ describe("tts", () => {
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe("maybeApplyTtsToPayload", () => {
  // Config with auto-TTS enabled and gated on inbound audio.
  const baseCfg = {
    agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
    messages: { tts: { enabled: true, onlyWhenInboundAudio: true } },
  };

  /**
   * Puts CLAWDBOT_TTS_PREFS back to its pre-test state. Assigning `undefined`
   * to a `process.env` key stores the literal string "undefined", so an
   * originally-unset variable has to be deleted instead.
   */
  const restorePrefsEnv = (previous: string | undefined): void => {
    if (previous === undefined) {
      delete process.env.CLAWDBOT_TTS_PREFS;
    } else {
      process.env.CLAWDBOT_TTS_PREFS = previous;
    }
  };

  it("skips auto-TTS when inbound audio gating is on and the message is not audio", async () => {
    const prevPrefs = process.env.CLAWDBOT_TTS_PREFS;
    process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
    const spy = vi.spyOn(tts, "textToSpeech").mockResolvedValue({
      success: false,
      error: "nope",
    });

    try {
      const payload = { text: "Hello world" };
      const result = await maybeApplyTtsToPayload({
        payload,
        cfg: baseCfg,
        kind: "final",
        inboundAudio: false,
      });

      // Gated out: the same payload object comes back and no synthesis is attempted.
      expect(result).toBe(payload);
      expect(spy).not.toHaveBeenCalled();
    } finally {
      // Always undo the spy and env override, even when an assertion fails.
      spy.mockRestore();
      restorePrefsEnv(prevPrefs);
    }
  });

  it("attempts auto-TTS when inbound audio gating is on and the message is audio", async () => {
    const prevPrefs = process.env.CLAWDBOT_TTS_PREFS;
    process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
    const spy = vi.spyOn(tts, "textToSpeech").mockResolvedValue({
      success: false,
      error: "nope",
    });

    try {
      await maybeApplyTtsToPayload({
        payload: { text: "Hello world" },
        cfg: baseCfg,
        kind: "final",
        inboundAudio: true,
      });

      expect(spy).toHaveBeenCalledTimes(1);
    } finally {
      // Always undo the spy and env override, even when an assertion fails.
      spy.mockRestore();
      restorePrefsEnv(prevPrefs);
    }
  });
});
|
||||||
});
|
});
|
||||||
|
|||||||
@ -77,6 +77,7 @@ const DEFAULT_OUTPUT = {
|
|||||||
|
|
||||||
export type ResolvedTtsConfig = {
|
export type ResolvedTtsConfig = {
|
||||||
enabled: boolean;
|
enabled: boolean;
|
||||||
|
onlyWhenInboundAudio: boolean;
|
||||||
mode: TtsMode;
|
mode: TtsMode;
|
||||||
provider: TtsProvider;
|
provider: TtsProvider;
|
||||||
providerSource: "config" | "default";
|
providerSource: "config" | "default";
|
||||||
@ -222,6 +223,7 @@ export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
|
|||||||
const edgeOutputFormat = raw.edge?.outputFormat?.trim();
|
const edgeOutputFormat = raw.edge?.outputFormat?.trim();
|
||||||
return {
|
return {
|
||||||
enabled: raw.enabled ?? false,
|
enabled: raw.enabled ?? false,
|
||||||
|
onlyWhenInboundAudio: raw.onlyWhenInboundAudio ?? false,
|
||||||
mode: raw.mode ?? "final",
|
mode: raw.mode ?? "final",
|
||||||
provider: raw.provider ?? "edge",
|
provider: raw.provider ?? "edge",
|
||||||
providerSource,
|
providerSource,
|
||||||
@ -285,11 +287,17 @@ export function buildTtsSystemPromptHint(cfg: ClawdbotConfig): string | undefine
|
|||||||
if (!isTtsEnabled(config, prefsPath)) return undefined;
|
if (!isTtsEnabled(config, prefsPath)) return undefined;
|
||||||
const maxLength = getTtsMaxLength(prefsPath);
|
const maxLength = getTtsMaxLength(prefsPath);
|
||||||
const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
|
const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
|
||||||
|
const inboundAudioHint = config.onlyWhenInboundAudio
|
||||||
|
? "Only use TTS when the user's last message includes audio/voice."
|
||||||
|
: undefined;
|
||||||
return [
|
return [
|
||||||
"Voice (TTS) is enabled.",
|
"Voice (TTS) is enabled.",
|
||||||
|
inboundAudioHint,
|
||||||
`Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summarize}).`,
|
`Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summarize}).`,
|
||||||
"Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.",
|
"Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.",
|
||||||
].join("\n");
|
]
|
||||||
|
.filter(Boolean)
|
||||||
|
.join("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
function readPrefs(prefsPath: string): TtsUserPrefs {
|
function readPrefs(prefsPath: string): TtsUserPrefs {
|
||||||
@ -1156,10 +1164,12 @@ export async function maybeApplyTtsToPayload(params: {
|
|||||||
cfg: ClawdbotConfig;
|
cfg: ClawdbotConfig;
|
||||||
channel?: string;
|
channel?: string;
|
||||||
kind?: "tool" | "block" | "final";
|
kind?: "tool" | "block" | "final";
|
||||||
|
inboundAudio?: boolean;
|
||||||
}): Promise<ReplyPayload> {
|
}): Promise<ReplyPayload> {
|
||||||
const config = resolveTtsConfig(params.cfg);
|
const config = resolveTtsConfig(params.cfg);
|
||||||
const prefsPath = resolveTtsPrefsPath(config);
|
const prefsPath = resolveTtsPrefsPath(config);
|
||||||
if (!isTtsEnabled(config, prefsPath)) return params.payload;
|
if (!isTtsEnabled(config, prefsPath)) return params.payload;
|
||||||
|
if (config.onlyWhenInboundAudio && params.inboundAudio !== true) return params.payload;
|
||||||
|
|
||||||
const mode = config.mode ?? "final";
|
const mode = config.mode ?? "final";
|
||||||
if (mode === "final" && params.kind && params.kind !== "final") return params.payload;
|
if (mode === "final" && params.kind && params.kind !== "final") return params.payload;
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user