diff --git a/src/config/types.hooks.ts b/src/config/types.hooks.ts
index 7ca74605a..ee9e23b83 100644
--- a/src/config/types.hooks.ts
+++ b/src/config/types.hooks.ts
@@ -67,6 +67,8 @@ export type HooksGmailConfig = {
model?: string;
/** Optional thinking level override for Gmail hook processing. */
thinking?: "off" | "minimal" | "low" | "medium" | "high";
+ /** Sanitise raw HTML email bodies to plain text before injecting into sessions (default: true). */
+ sanitizeBody?: boolean;
};
export type InternalHookHandlerConfig = {
diff --git a/src/config/zod-schema.hooks.ts b/src/config/zod-schema.hooks.ts
index 35e74f7af..32eee4b2b 100644
--- a/src/config/zod-schema.hooks.ts
+++ b/src/config/zod-schema.hooks.ts
@@ -125,6 +125,7 @@ export const HooksGmailSchema = z
z.literal("high"),
])
.optional(),
+ sanitizeBody: z.boolean().optional(),
})
.strict()
.optional();
diff --git a/src/gateway/hooks-mapping.test.ts b/src/gateway/hooks-mapping.test.ts
index 8900ffd07..ece4fcfeb 100644
--- a/src/gateway/hooks-mapping.test.ts
+++ b/src/gateway/hooks-mapping.test.ts
@@ -153,6 +153,62 @@ describe("hooks mapping", () => {
}
});
+  // Default behaviour: with the gmail preset and no explicit `sanitizeBody` setting,
+  // the message body must reach the rendered agent message as plain text.
+  it("sanitises email body HTML for gmail preset by default", async () => {
+    const mappings = resolveHookMappings({ presets: ["gmail"] });
+    const result = await applyHookMappings(mappings, {
+      payload: {
+        messages: [
+          {
+            id: "msg1",
+            from: "test@example.com",
+            subject: "Test",
+            snippet: "Preview",
+            // Raw HTML carrying everything the assertions below expect to be removed:
+            // markup, a <style> block, and an unsubscribe footer.
+            body: "<html><head><style>p { color: red; }</style></head><body><p>Hello world</p><p>Unsubscribe from our list</p></body></html>",
+          },
+        ],
+      },
+      headers: {},
+      url: baseUrl,
+      path: "gmail",
+    });
+    expect(result?.ok).toBe(true);
+    // Fail loudly instead of silently skipping the body assertions when the
+    // mapping did not produce an agent action.
+    expect(result?.ok && result.action?.kind).toBe("agent");
+    if (result?.ok && result.action?.kind === "agent") {
+      expect(result.action.message).toContain("Hello world");
+      expect(result.action.message).not.toContain("<");
+      expect(result.action.message).not.toContain("style");
+      expect(result.action.message).not.toMatch(/unsubscribe/i);
+    }
+  });
+
+  // Opt-out path: `hooks.gmail.sanitizeBody: false` must leave the body untouched.
+  it("skips sanitisation when sanitizeBody is false", async () => {
+    const mappings = resolveHookMappings({
+      presets: ["gmail"],
+      gmail: { sanitizeBody: false },
+    });
+    const result = await applyHookMappings(mappings, {
+      payload: {
+        messages: [
+          {
+            id: "msg1",
+            from: "test@example.com",
+            subject: "Test",
+            snippet: "Preview",
+            body: "<p>Hello world</p>",
+          },
+        ],
+      },
+      headers: {},
+      url: baseUrl,
+      path: "gmail",
+    });
+    expect(result?.ok).toBe(true);
+    // Fail loudly instead of silently skipping the body assertions when the
+    // mapping did not produce an agent action.
+    expect(result?.ok && result.action?.kind).toBe("agent");
+    if (result?.ok && result.action?.kind === "agent") {
+      // With sanitisation disabled the raw markup must reach the rendered message.
+      expect(result.action.message).toContain("<p>");
+      expect(result.action.message).toContain("</p>");
+    }
+  });
+
it("rejects missing message", async () => {
const mappings = resolveHookMappings({
mappings: [{ match: { path: "noop" }, action: "agent" }],
diff --git a/src/gateway/hooks-mapping.ts b/src/gateway/hooks-mapping.ts
index 2ebf9b136..8dc9d2c68 100644
--- a/src/gateway/hooks-mapping.ts
+++ b/src/gateway/hooks-mapping.ts
@@ -2,6 +2,7 @@ import path from "node:path";
import { pathToFileURL } from "node:url";
import { CONFIG_PATH, type HookMappingConfig, type HooksConfig } from "../config/config.js";
+import { sanitizeEmailBody } from "../hooks/sanitize-email-body.js";
import type { HookMessageChannel } from "./hooks.js";
export type HookMappingResolved = {
@@ -22,6 +23,8 @@ export type HookMappingResolved = {
thinking?: string;
timeoutSeconds?: number;
transform?: HookMappingTransformResolved;
+ /** When true, sanitise HTML email bodies in the payload before template rendering. */
+ sanitizeBody?: boolean;
};
export type HookMappingTransformResolved = {
@@ -103,16 +106,21 @@ type HookTransformFn = (
export function resolveHookMappings(hooks?: HooksConfig): HookMappingResolved[] {
const presets = hooks?.presets ?? [];
const gmailAllowUnsafe = hooks?.gmail?.allowUnsafeExternalContent;
+ // sanitizeBody defaults to true unless explicitly set to false
+ const gmailSanitizeBody = hooks?.gmail?.sanitizeBody !== false;
const mappings: HookMappingConfig[] = [];
if (hooks?.mappings) mappings.push(...hooks.mappings);
for (const preset of presets) {
const presetMappings = hookPresetMappings[preset];
if (!presetMappings) continue;
- if (preset === "gmail" && typeof gmailAllowUnsafe === "boolean") {
+ if (preset === "gmail") {
mappings.push(
...presetMappings.map((mapping) => ({
...mapping,
- allowUnsafeExternalContent: gmailAllowUnsafe,
+ ...(typeof gmailAllowUnsafe === "boolean"
+ ? { allowUnsafeExternalContent: gmailAllowUnsafe }
+ : {}),
+ _sanitizeBody: gmailSanitizeBody,
})),
);
continue;
@@ -137,7 +145,10 @@ export async function applyHookMappings(
for (const mapping of mappings) {
if (!mappingMatches(mapping, ctx)) continue;
- const base = buildActionFromMapping(mapping, ctx);
+    // Sanitise email bodies on a cloned context (ctx itself is not mutated) before template rendering
+ const effectiveCtx = mapping.sanitizeBody ? sanitizePayloadBodies(ctx) : ctx;
+
+ const base = buildActionFromMapping(mapping, effectiveCtx);
if (!base.ok) return base;
let override: HookTransformResult = null;
@@ -174,6 +185,11 @@ function normalizeHookMapping(
}
: undefined;
+  // `_sanitizeBody` is attached by resolveHookMappings for the gmail preset and is
+  // read through a cast here; any non-boolean value is treated as "not set".
+  const rawSanitizeBody = (mapping as Record<string, unknown>)._sanitizeBody;
+  const sanitizeBody = typeof rawSanitizeBody === "boolean" ? rawSanitizeBody : undefined;
+
return {
id,
matchPath,
@@ -192,6 +208,7 @@ function normalizeHookMapping(
thinking: mapping.thinking,
timeoutSeconds: mapping.timeoutSeconds,
transform,
+ sanitizeBody,
};
}
@@ -359,6 +376,25 @@ function resolveTemplateExpr(expr: string, ctx: HookMappingContext) {
return getByPath(ctx.payload, expr);
}
+/**
+ * Return a copy of `ctx` in which each string `body` on an entry of
+ * `payload.messages` has been run through `sanitizeEmailBody`. Neither `ctx`
+ * nor its message objects are mutated; `ctx` is returned as-is when
+ * `payload.messages` is not a non-empty array.
+ */
+function sanitizePayloadBodies(ctx: HookMappingContext): HookMappingContext {
+  const messages = ctx.payload.messages;
+  if (!Array.isArray(messages) || messages.length === 0) return ctx;
+
+  const cleaned = messages.map((msg: unknown) => {
+    // Entries that are not objects, or carry no string body, pass through untouched.
+    if (msg === null || typeof msg !== "object") return msg;
+    const rec = msg as Record<string, unknown>;
+    if (typeof rec.body !== "string") return msg;
+    return { ...rec, body: sanitizeEmailBody(rec.body) };
+  });
+
+  return { ...ctx, payload: { ...ctx.payload, messages: cleaned } };
+}
+
function getByPath(input: Record, pathExpr: string): unknown {
if (!pathExpr) return undefined;
const parts: Array = [];
diff --git a/src/hooks/sanitize-email-body.test.ts b/src/hooks/sanitize-email-body.test.ts
new file mode 100644
index 000000000..c80ce5339
--- /dev/null
+++ b/src/hooks/sanitize-email-body.test.ts
@@ -0,0 +1,230 @@
+import { describe, expect, it } from "vitest";
+import { sanitizeEmailBody } from "./sanitize-email-body.js";
+
+describe("sanitizeEmailBody", () => {
+ it("returns empty string for falsy input", () => {
+ expect(sanitizeEmailBody("")).toBe("");
+ expect(sanitizeEmailBody(null as unknown as string)).toBe("");
+ expect(sanitizeEmailBody(undefined as unknown as string)).toBe("");
+ });
+
+ it("passes through plain text unchanged", () => {
+ expect(sanitizeEmailBody("Hello world")).toBe("Hello world");
+ });
+
+ // --- HTML stripping ---
+
+ it("strips basic HTML tags", () => {
+ expect(sanitizeEmailBody("Hello world
")).toBe("Hello world");
+ });
+
+ it("removes style blocks and their contents", () => {
+ const html = 'Content
';
+ expect(sanitizeEmailBody(html)).toBe("Content");
+ });
+
+ it("removes script blocks", () => {
+ const html = "Safe
";
+ expect(sanitizeEmailBody(html)).toBe("Safe");
+ });
+
+ it("removes HTML comments", () => {
+ const html = "Visible
";
+ expect(sanitizeEmailBody(html)).toBe("Visible");
+ });
+
+ // --- Newline conversion ---
+
+ it("converts
to newlines", () => {
+ expect(sanitizeEmailBody("Line 1
Line 2
Line 3")).toBe("Line 1\nLine 2\nLine 3");
+ });
+
+ it("converts block-level closing tags to newlines", () => {
+ const html = "Block 1
Block 2
";
+ const result = sanitizeEmailBody(html);
+ expect(result).toContain("Block 1");
+ expect(result).toContain("Block 2");
+ expect(result).toMatch(/Block 1\n+Block 2/);
+ });
+
+ // --- HTML entities ---
+
+  // The input must contain the encoded entities; the expectation is their decoded form.
+  it("decodes named HTML entities", () => {
+    expect(sanitizeEmailBody("&amp; &lt; &gt; &quot;")).toBe('& < > "');
+  });
+
+  // &#65; &#66; &#67; are the decimal character references for "A", "B", "C".
+  it("decodes numeric decimal entities", () => {
+    expect(sanitizeEmailBody("&#65;&#66;&#67;")).toBe("ABC");
+  });
+
+  // &#x41; &#x42; &#x43; are the hexadecimal character references for "A", "B", "C".
+  it("decodes numeric hex entities", () => {
+    expect(sanitizeEmailBody("&#x41;&#x42;&#x43;")).toBe("ABC");
+  });
+
+  // Curly quotes and the em dash must arrive encoded so that decoding is what is tested.
+  it("decodes typographic entities", () => {
+    expect(sanitizeEmailBody("&ldquo;Hello&rdquo; &mdash; world")).toBe(
+      "\u201CHello\u201D — world",
+    );
+  });
+
+ // --- Data URIs / base64 ---
+
+ it("removes base64 data URIs", () => {
+ const html = '
Text
';
+ expect(sanitizeEmailBody(html)).toBe("Text");
+ });
+
+ it("removes inline base64 content", () => {
+ const html = "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP some text after";
+ const result = sanitizeEmailBody(html);
+ expect(result).not.toContain("base64");
+ expect(result).toContain("some text after");
+ });
+
+ // --- Tracking pixels ---
+
+ it("removes 1x1 tracking pixel images", () => {
+ const html = '
Content
';
+ expect(sanitizeEmailBody(html)).toBe("Content");
+ });
+
+ it("removes display:none images", () => {
+ const html = '
Content
';
+ expect(sanitizeEmailBody(html)).toBe("Content");
+ });
+
+ // --- Links ---
+
+ it("keeps link text, drops tracking hrefs", () => {
+ const html = 'Click here';
+ expect(sanitizeEmailBody(html)).toBe("");
+ });
+
+ it("keeps useful link text", () => {
+ const html = 'Read more';
+ expect(sanitizeEmailBody(html)).toBe("Read more");
+ });
+
+ // --- Footer patterns ---
+
+ it("removes unsubscribe text", () => {
+ const html = "Real content
Unsubscribe from this mailing list
";
+ const result = sanitizeEmailBody(html);
+ expect(result).toContain("Real content");
+ expect(result).not.toMatch(/unsubscribe/i);
+ });
+
+ it("removes 'sent from my iPhone'", () => {
+ const html = "Hey!
Sent from my iPhone
";
+ const result = sanitizeEmailBody(html);
+ expect(result).toContain("Hey!");
+ expect(result).not.toMatch(/sent from my iphone/i);
+ });
+
+ it("removes 'Get Outlook for iOS'", () => {
+ const html = "Meeting at 3
Get Outlook for iOS
";
+ const result = sanitizeEmailBody(html);
+ expect(result).toContain("Meeting at 3");
+ expect(result).not.toMatch(/get outlook/i);
+ });
+
+ it("removes confidentiality notices", () => {
+ const html =
+ "Actual content
Confidentiality notice: This email is intended solely for the use of the individual to whom it is addressed.
";
+ const result = sanitizeEmailBody(html);
+ expect(result).toContain("Actual content");
+ expect(result).not.toMatch(/confidentiality notice/i);
+ });
+
+ it("removes copyright notices", () => {
+ const html = "Content
© 2024 Acme Corp. All rights reserved.
";
+ const result = sanitizeEmailBody(html);
+ expect(result).toContain("Content");
+ expect(result).not.toMatch(/all rights reserved/i);
+ });
+
+ it("removes 'you are receiving this email because'", () => {
+ const html =
+ "Newsletter
You are receiving this email because you signed up on our website.
";
+ const result = sanitizeEmailBody(html);
+ expect(result).toContain("Newsletter");
+ expect(result).not.toMatch(/you are receiving/i);
+ });
+
+ it("removes privacy policy / terms of service", () => {
+ const html = "Content
Privacy Policy | Terms of Service
";
+ const result = sanitizeEmailBody(html);
+ expect(result).toContain("Content");
+ expect(result).not.toMatch(/privacy policy/i);
+ });
+
+ // --- Whitespace collapsing ---
+
+ it("collapses excessive blank lines to max 2 newlines", () => {
+ const html = "A
\n\n\n\n\nB
";
+ const result = sanitizeEmailBody(html);
+ expect(result).not.toMatch(/\n{3,}/);
+ expect(result).toContain("A");
+ expect(result).toContain("B");
+ });
+
+ it("trims each line and removes leading/trailing whitespace", () => {
+ const html = " Hello
World
";
+ const result = sanitizeEmailBody(html);
+ expect(result).not.toMatch(/^\s/);
+ expect(result).not.toMatch(/\s$/);
+ for (const line of result.split("\n")) {
+ expect(line).toBe(line.trim());
+ }
+ });
+
+ it("collapses multiple spaces to single space", () => {
+ expect(sanitizeEmailBody("Hello world here")).toBe("Hello world here");
+ });
+
+ // --- Real-world-ish email ---
+
+ it("handles a typical marketing email", () => {
+ const html = `
+
+
+
+
+
+
+
+
Big Sale!
+
Save 50% on everything this weekend.
+
Use code: SAVE50
+
Shop Now
+
+
+
+
+ `;
+ const result = sanitizeEmailBody(html);
+
+ // Should keep
+ expect(result).toContain("Big Sale!");
+ expect(result).toContain("Save 50% on everything this weekend.");
+ expect(result).toContain("SAVE50");
+ expect(result).toContain("Shop Now");
+
+ // Should remove
+ expect(result).not.toContain("<");
+ expect(result).not.toContain("style");
+ expect(result).not.toMatch(/tracker/);
+ expect(result).not.toMatch(/unsubscribe/i);
+ expect(result).not.toMatch(/all rights reserved/i);
+ expect(result).not.toMatch(/privacy policy/i);
+ });
+});
diff --git a/src/hooks/sanitize-email-body.ts b/src/hooks/sanitize-email-body.ts
new file mode 100644
index 000000000..1bcea0c5a
--- /dev/null
+++ b/src/hooks/sanitize-email-body.ts
@@ -0,0 +1,178 @@
+/**
+ * Sanitise raw HTML email bodies into clean plain text.
+ *
+ * Strips HTML tags, CSS, scripts, tracking pixels, email footers,
+ * base64 data URIs, and excessive whitespace so the body is compact
+ * and ready for LLM consumption.
+ */
+
+// ---------------------------------------------------------------------------
+// Footer / boilerplate patterns (case-insensitive)
+//
+// Each pattern matches a trigger phrase; most are followed by a bounded
+// `.{0,N}` tail. Because none of them uses the `s` flag, `.` does not match
+// line terminators, so a match covers at most N further characters and never
+// runs past the end of the line the trigger phrase is on.
+//
+// NOTE(review): every pattern carries the `g` flag, which makes `test()` and
+// `exec()` stateful through `lastIndex`. How the patterns are applied is not
+// visible from here — confirm they are only used with `replace`/`replaceAll`.
+// ---------------------------------------------------------------------------
+const FOOTER_PATTERNS: RegExp[] = [
+ // Unsubscribe / manage preferences
+ /unsubscribe\b.{0,200}/gi,
+ /manage\s+(?:your\s+)?(?:email\s+)?preferences?.{0,100}/gi,
+ /opt[\s-]?out\b.{0,100}/gi,
+ /email\s+preferences?.{0,80}/gi,
+ /update\s+(?:your\s+)?subscription.{0,80}/gi,
+ /you\s+(?:are\s+)?receiv(?:ed?|ing)\s+this\s+(?:email|message)\s+because.{0,300}/gi,
+ /this\s+(?:email|message)\s+was\s+sent\s+(?:to|by).{0,200}/gi,
+ /if\s+you\s+no\s+longer\s+wish\s+to\s+receive.{0,200}/gi,
+ /to\s+stop\s+receiving\s+these\s+(?:emails|notifications|messages).{0,200}/gi,
+
+ // "Sent from" signatures
+ /sent\s+from\s+(?:my\s+)?(?:iphone|ipad|galaxy|android|samsung|pixel|outlook|mail).{0,60}/gi,
+ /get\s+outlook\s+for\s+(?:ios|android).{0,40}/gi,
+
+ // Privacy / legal
+ // The first pattern targets "this email ... is intended solely for the use of ..." style notices.
+ /this\s+(?:email|message)\s+(?:and\s+any\s+attachments?\s+)?(?:is|are)\s+(?:intended\s+)?(?:solely\s+)?(?:for\s+the\s+use\s+of).{0,500}/gi,
+ /confidential(?:ity)?\s+notice.{0,400}/gi,
+ /disclaimer:.{0,400}/gi,
+ /©\s*\d{4}.{0,120}/gi,
+ // The remaining patterns have no tail: only the phrase itself is matched.
+ /all\s+rights\s+reserved\.?/gi,
+ /privacy\s+policy/gi,
+ /terms\s+(?:of\s+(?:service|use)|and\s+conditions)/gi,
+];
+
+// ---------------------------------------------------------------------------
+// Core sanitiser
+// ---------------------------------------------------------------------------
+
+export function sanitizeEmailBody(html: string): string {
+ if (!html || typeof html !== "string") return "";
+
+ let text = html;
+
+ // 1. Remove