diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md
index 1d4e95cb0..2fd90cf30 100644
--- a/docs/gateway/configuration.md
+++ b/docs/gateway/configuration.md
@@ -1540,6 +1540,7 @@ voice notes; other channels send MP3 audio.
 
 Notes:
 - `messages.tts.enabled` can be overridden by local user prefs (see `/tts on`, `/tts off`).
+- `onlyWhenInboundAudio` limits auto-TTS to replies where the last inbound message includes audio/voice.
 - `prefsPath` stores local overrides (enabled/provider/limit/summarize).
 - `maxTextLength` is a hard cap for TTS input; summaries are truncated to fit.
 - `summaryModel` overrides `agents.defaults.model.primary` for auto-summary.
diff --git a/docs/tts.md b/docs/tts.md
index 61da1f0dc..dc6576154 100644
--- a/docs/tts.md
+++ b/docs/tts.md
@@ -165,6 +165,19 @@ Full schema is in [Gateway configuration](/gateway/configuration).
 }
 ```
 
+### Only reply with audio after an inbound voice note
+
+```json5
+{
+  messages: {
+    tts: {
+      enabled: true,
+      onlyWhenInboundAudio: true
+    }
+  }
+}
+```
+
 ### Disable auto-summary for long replies
 
 ```json5
@@ -186,6 +199,7 @@ Then run:
 ### Notes on fields
 
 - `enabled`: master toggle (default `false`; local prefs can override).
+- `onlyWhenInboundAudio`: only auto-send TTS when the last inbound message includes audio/voice (default `false`).
 - `mode`: `"final"` (default) or `"all"` (includes tool/block replies).
 - `provider`: `"elevenlabs"`, `"openai"`, or `"edge"` (fallback is automatic).
 - If `provider` is **unset**, Clawdbot prefers `openai` (if key), then `elevenlabs` (if key),
diff --git a/src/auto-reply/reply/dispatch-from-config.ts b/src/auto-reply/reply/dispatch-from-config.ts
index 5885d729e..d474ccb93 100644
--- a/src/auto-reply/reply/dispatch-from-config.ts
+++ b/src/auto-reply/reply/dispatch-from-config.ts
@@ -21,6 +21,35 @@ export type DispatchFromConfigResult = {
   counts: Record<ReplyDispatchKind, number>;
 };
 
+// Bodies that stand in for a voice note: a bare `<media:audio>` placeholder, optionally followed
+// by a parenthesized suffix, or any text that starts with an `[Audio]` header.
+const AUDIO_PLACEHOLDER_RE = /^<media:audio>(\s*\([^)]*\))?$/i;
+const AUDIO_HEADER_RE = /^\[Audio\]/i;
+
+/** Drops MIME parameters (everything from the first `;`), trims, and lowercases. */
+const normalizeMediaType = (value: string): string =>
+  (value.split(";")[0] ?? "").trim().toLowerCase();
+
+/**
+ * Reports whether the inbound message carries audio: true when any declared media type is
+ * `audio` or `audio/*`, otherwise when the message body is an audio placeholder or header.
+ */
+const isInboundAudioContext = (ctx: FinalizedMsgContext): boolean => {
+  const declared = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : [];
+  // Narrow with a type guard instead of a cast so non-string entries never reach `.split`.
+  const types = [ctx.MediaType, ...declared]
+    .filter((value): value is string => typeof value === "string" && value !== "")
+    .map((value) => normalizeMediaType(value));
+  if (types.some((type) => type === "audio" || type.startsWith("audio/"))) return true;
+
+  // Use the first body field that is actually a string, checked in this order.
+  const candidates = [ctx.BodyForCommands, ctx.CommandBody, ctx.RawBody, ctx.Body];
+  const body = candidates.find((value): value is string => typeof value === "string") ?? "";
+  const trimmed = body.trim();
+  if (!trimmed) return false;
+  return AUDIO_PLACEHOLDER_RE.test(trimmed) || AUDIO_HEADER_RE.test(trimmed);
+};
+
 export async function dispatchReplyFromConfig(params: {
   ctx: FinalizedMsgContext;
   cfg: ClawdbotConfig;
@@ -81,6 +110,8 @@ export async function dispatchReplyFromConfig(params: {
     return { queuedFinal: false, counts: dispatcher.getQueuedCounts() };
   }
 
+  const inboundAudio = isInboundAudioContext(ctx);
+
   const hookRunner = getGlobalHookRunner();
   if (hookRunner?.hasHooks("message_received")) {
     const timestamp =
@@ -223,6 +254,7 @@ export async function dispatchReplyFromConfig(params: {
               cfg,
               channel: ttsChannel,
               kind: "tool",
+              inboundAudio,
             });
             if (shouldRouteToOriginating) {
               await sendPayloadAsync(ttsPayload);
@@ -239,6 +271,7 @@ export async function dispatchReplyFromConfig(params: {
               cfg,
               channel: ttsChannel,
               kind: "block",
+              inboundAudio,
             });
             if (shouldRouteToOriginating) {
               await sendPayloadAsync(ttsPayload, context?.abortSignal);
@@ -262,6 +295,7 @@ export async function dispatchReplyFromConfig(params: {
         cfg,
         channel: ttsChannel,
         kind: "final",
+        inboundAudio,
       });
       if (shouldRouteToOriginating && originatingChannel && originatingTo) {
         // Route final reply to originating channel.
diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts
index 28b65c96d..fe340d468 100644
--- a/src/config/types.tts.ts
+++ b/src/config/types.tts.ts
@@ -24,6 +24,8 @@ export type TtsModelOverrideConfig = {
 export type TtsConfig = {
   /** Enable auto-TTS (can be overridden by local prefs). */
   enabled?: boolean;
+  /** Only run auto-TTS when the last inbound message includes audio. */
+  onlyWhenInboundAudio?: boolean;
   /** Apply TTS to final replies only or to all replies (tool/block/final). */
   mode?: TtsMode;
   /** Primary TTS provider (fallbacks are automatic). */
diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts
index bcf769b67..b8c1d8f4d 100644
--- a/src/config/zod-schema.core.ts
+++ b/src/config/zod-schema.core.ts
@@ -161,6 +161,7 @@ export const TtsModeSchema = z.enum(["final", "all"]);
 export const TtsConfigSchema = z
   .object({
     enabled: z.boolean().optional(),
+    onlyWhenInboundAudio: z.boolean().optional(),
     mode: TtsModeSchema.optional(),
     provider: TtsProviderSchema.optional(),
     summaryModel: z.string().optional(),
diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts
index fafe3bbdf..73c38a9e4 100644
--- a/src/tts/tts.test.ts
+++ b/src/tts/tts.test.ts
@@ -4,7 +4,7 @@ import { completeSimple } from "@mariozechner/pi-ai";
 
 import { getApiKeyForModel } from "../agents/model-auth.js";
 import { resolveModel } from "../agents/pi-embedded-runner/model.js";
-import { _test, getTtsProvider, resolveTtsConfig } from "./tts.js";
+import * as tts from "./tts.js";
 
 vi.mock("@mariozechner/pi-ai", () => ({
   completeSimple: vi.fn(),
@@ -37,6 +37,8 @@ vi.mock("../agents/model-auth.js", () => ({
   requireApiKey: vi.fn((auth: { apiKey?: string }) => auth.apiKey ?? ""),
 }));
 
+const { _test, resolveTtsConfig, maybeApplyTtsToPayload, getTtsProvider } = tts;
+
 const {
   isValidVoiceId,
   isValidOpenAIVoice,
@@ -431,4 +433,55 @@ describe("tts", () => {
       );
     });
   });
+
+  describe("maybeApplyTtsToPayload", () => {
+    // TTS is enabled but gated on the inbound message carrying audio.
+    const baseCfg = {
+      agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
+      messages: { tts: { enabled: true, onlyWhenInboundAudio: true } },
+    };
+
+    // Calls maybeApplyTtsToPayload with CLAWDBOT_TTS_PREFS pointed at a throwaway path and
+    // textToSpeech stubbed to fail. The env var and the spy are restored in `finally`, so a
+    // throwing call cannot leak state into later tests.
+    const runWithStubbedTts = async (inboundAudio: boolean) => {
+      const prevPrefs = process.env.CLAWDBOT_TTS_PREFS;
+      process.env.CLAWDBOT_TTS_PREFS = `/tmp/tts-test-${Date.now()}.json`;
+      const spy = vi.spyOn(tts, "textToSpeech").mockResolvedValue({
+        success: false,
+        error: "nope",
+      });
+      const payload = { text: "Hello world" };
+
+      try {
+        const result = await maybeApplyTtsToPayload({
+          payload,
+          cfg: baseCfg,
+          kind: "final",
+          inboundAudio,
+        });
+        // Read the call count before the spy is restored below.
+        return { payload, result, ttsCalls: spy.mock.calls.length };
+      } finally {
+        spy.mockRestore();
+        // process.env stringifies assigned values, so an originally unset variable has to be
+        // deleted; assigning `undefined` would leave the literal string "undefined" behind.
+        if (prevPrefs === undefined) delete process.env.CLAWDBOT_TTS_PREFS;
+        else process.env.CLAWDBOT_TTS_PREFS = prevPrefs;
+      }
+    };
+
+    it("skips auto-TTS when inbound audio gating is on and the message is not audio", async () => {
+      const { payload, result, ttsCalls } = await runWithStubbedTts(false);
+
+      // The very same payload object comes back and TTS is never invoked.
+      expect(result).toBe(payload);
+      expect(ttsCalls).toBe(0);
+    });
+
+    it("attempts auto-TTS when inbound audio gating is on and the message is audio", async () => {
+      const { ttsCalls } = await runWithStubbedTts(true);
+      expect(ttsCalls).toBe(1);
+    });
+  });
 });
diff --git a/src/tts/tts.ts b/src/tts/tts.ts
index cf2823f95..2a7568cbf 100644
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@@ -77,6 +77,7 @@ const DEFAULT_OUTPUT = {
 
 export type ResolvedTtsConfig = {
   enabled: boolean;
+  onlyWhenInboundAudio: boolean;
   mode: TtsMode;
   provider: TtsProvider;
   providerSource: "config" | "default";
@@ -222,6 +223,7 @@ export function resolveTtsConfig(cfg: ClawdbotConfig): ResolvedTtsConfig {
   const edgeOutputFormat = raw.edge?.outputFormat?.trim();
   return {
     enabled: raw.enabled ?? false,
+    onlyWhenInboundAudio: raw.onlyWhenInboundAudio ?? false,
     mode: raw.mode ?? "final",
     provider: raw.provider ?? "edge",
     providerSource,
@@ -285,11 +287,17 @@ export function buildTtsSystemPromptHint(cfg: ClawdbotConfig): string | undefine
   if (!isTtsEnabled(config, prefsPath)) return undefined;
   const maxLength = getTtsMaxLength(prefsPath);
   const summarize = isSummarizationEnabled(prefsPath) ? "on" : "off";
+  const inboundAudioHint = config.onlyWhenInboundAudio
+    ? "Only use TTS when the user's last message includes audio/voice."
+    : undefined;
   return [
     "Voice (TTS) is enabled.",
+    inboundAudioHint,
     `Keep spoken text ≤${maxLength} chars to avoid auto-summary (summary ${summarize}).`,
     "Use [[tts:...]] and optional [[tts:text]]...[[/tts:text]] to control voice/expressiveness.",
-  ].join("\n");
+  ]
+    .filter(Boolean)
+    .join("\n");
 }
 
 function readPrefs(prefsPath: string): TtsUserPrefs {
@@ -1156,10 +1164,12 @@ export async function maybeApplyTtsToPayload(params: {
   cfg: ClawdbotConfig;
   channel?: string;
   kind?: "tool" | "block" | "final";
+  inboundAudio?: boolean;
 }): Promise<ReplyPayload> {
   const config = resolveTtsConfig(params.cfg);
   const prefsPath = resolveTtsPrefsPath(config);
   if (!isTtsEnabled(config, prefsPath)) return params.payload;
+  if (config.onlyWhenInboundAudio && params.inboundAudio !== true) return params.payload;
 
   const mode = config.mode ?? "final";
   if (mode === "final" && params.kind && params.kind !== "final") return params.payload;