From 52c584c26db61d175bd711c1d136831e0e162eeb Mon Sep 17 00:00:00 2001 From: Adam Holt Date: Thu, 29 Jan 2026 23:59:40 +0000 Subject: [PATCH] feat(gmail): sanitise HTML email bodies before template injection Strip raw HTML from email bodies in the Gmail hook pipeline so that clean plain text reaches the agent session instead of bloated HTML with CSS, tracking pixels, footers, and boilerplate. What it does: - Strips HTML tags,

Unsubscribe from our list

", + }, + ], + }, + headers: {}, + url: baseUrl, + path: "gmail", + }); + expect(result?.ok).toBe(true); + if (result?.ok && result.action?.kind === "agent") { + expect(result.action.message).toContain("Hello world"); + expect(result.action.message).not.toContain("<"); + expect(result.action.message).not.toContain("style"); + expect(result.action.message).not.toMatch(/unsubscribe/i); + } + }); + + it("skips sanitisation when sanitizeBody is false", async () => { + const mappings = resolveHookMappings({ + presets: ["gmail"], + gmail: { sanitizeBody: false }, + }); + const result = await applyHookMappings(mappings, { + payload: { + messages: [ + { + id: "msg1", + from: "test@example.com", + subject: "Test", + snippet: "Preview", + body: "

Hello world

", + }, + ], + }, + headers: {}, + url: baseUrl, + path: "gmail", + }); + expect(result?.ok).toBe(true); + if (result?.ok && result.action?.kind === "agent") { + // Body should still contain HTML tags + expect(result.action.message).toContain("

"); + expect(result.action.message).toContain(""); + } + }); + it("rejects missing message", async () => { const mappings = resolveHookMappings({ mappings: [{ match: { path: "noop" }, action: "agent" }], diff --git a/src/gateway/hooks-mapping.ts b/src/gateway/hooks-mapping.ts index 2ebf9b136..8dc9d2c68 100644 --- a/src/gateway/hooks-mapping.ts +++ b/src/gateway/hooks-mapping.ts @@ -2,6 +2,7 @@ import path from "node:path"; import { pathToFileURL } from "node:url"; import { CONFIG_PATH, type HookMappingConfig, type HooksConfig } from "../config/config.js"; +import { sanitizeEmailBody } from "../hooks/sanitize-email-body.js"; import type { HookMessageChannel } from "./hooks.js"; export type HookMappingResolved = { @@ -22,6 +23,8 @@ export type HookMappingResolved = { thinking?: string; timeoutSeconds?: number; transform?: HookMappingTransformResolved; + /** When true, sanitise HTML email bodies in the payload before template rendering. */ + sanitizeBody?: boolean; }; export type HookMappingTransformResolved = { @@ -103,16 +106,21 @@ type HookTransformFn = ( export function resolveHookMappings(hooks?: HooksConfig): HookMappingResolved[] { const presets = hooks?.presets ?? []; const gmailAllowUnsafe = hooks?.gmail?.allowUnsafeExternalContent; + // sanitizeBody defaults to true unless explicitly set to false + const gmailSanitizeBody = hooks?.gmail?.sanitizeBody !== false; const mappings: HookMappingConfig[] = []; if (hooks?.mappings) mappings.push(...hooks.mappings); for (const preset of presets) { const presetMappings = hookPresetMappings[preset]; if (!presetMappings) continue; - if (preset === "gmail" && typeof gmailAllowUnsafe === "boolean") { + if (preset === "gmail") { mappings.push( ...presetMappings.map((mapping) => ({ ...mapping, - allowUnsafeExternalContent: gmailAllowUnsafe, + ...(typeof gmailAllowUnsafe === "boolean" + ? { allowUnsafeExternalContent: gmailAllowUnsafe } + : {}), + _sanitizeBody: gmailSanitizeBody, })), ); continue; @@ -137,7 +145,10 @@ export async function applyHookMappings( for (const mapping of mappings) { if (!mappingMatches(mapping, ctx)) continue; - const base = buildActionFromMapping(mapping, ctx); + // Sanitise email bodies in-place before template rendering + const effectiveCtx = mapping.sanitizeBody ? sanitizePayloadBodies(ctx) : ctx; + + const base = buildActionFromMapping(mapping, effectiveCtx); if (!base.ok) return base; let override: HookTransformResult = null; @@ -174,6 +185,11 @@ function normalizeHookMapping( } : undefined; + const sanitizeBody = + typeof (mapping as Record)._sanitizeBody === "boolean" + ? ((mapping as Record)._sanitizeBody as boolean) + : undefined; + return { id, matchPath, @@ -192,6 +208,7 @@ function normalizeHookMapping( thinking: mapping.thinking, timeoutSeconds: mapping.timeoutSeconds, transform, + sanitizeBody, }; } @@ -359,6 +376,25 @@ function resolveTemplateExpr(expr: string, ctx: HookMappingContext) { return getByPath(ctx.payload, expr); } +/** + * Return a shallow-cloned context whose `messages[].body` fields have been + * sanitised from raw HTML to clean plain text. The original context is not + * mutated. + */ +function sanitizePayloadBodies(ctx: HookMappingContext): HookMappingContext { + const messages = ctx.payload.messages; + if (!Array.isArray(messages) || messages.length === 0) return ctx; + + const cleaned = messages.map((msg: unknown) => { + if (msg === null || typeof msg !== "object") return msg; + const rec = msg as Record; + if (typeof rec.body !== "string") return msg; + return { ...rec, body: sanitizeEmailBody(rec.body) }; + }); + + return { ...ctx, payload: { ...ctx.payload, messages: cleaned } }; +} + function getByPath(input: Record, pathExpr: string): unknown { if (!pathExpr) return undefined; const parts: Array = []; diff --git a/src/hooks/sanitize-email-body.test.ts b/src/hooks/sanitize-email-body.test.ts new file mode 100644 index 000000000..c80ce5339 --- /dev/null +++ b/src/hooks/sanitize-email-body.test.ts @@ -0,0 +1,230 @@ +import { describe, expect, it } from "vitest"; +import { sanitizeEmailBody } from "./sanitize-email-body.js"; + +describe("sanitizeEmailBody", () => { + it("returns empty string for falsy input", () => { + expect(sanitizeEmailBody("")).toBe(""); + expect(sanitizeEmailBody(null as unknown as string)).toBe(""); + expect(sanitizeEmailBody(undefined as unknown as string)).toBe(""); + }); + + it("passes through plain text unchanged", () => { + expect(sanitizeEmailBody("Hello world")).toBe("Hello world"); + }); + + // --- HTML stripping --- + + it("strips basic HTML tags", () => { + expect(sanitizeEmailBody("

Hello world

")).toBe("Hello world"); + }); + + it("removes style blocks and their contents", () => { + const html = '

Content

'; + expect(sanitizeEmailBody(html)).toBe("Content"); + }); + + it("removes script blocks", () => { + const html = "

Safe

"; + expect(sanitizeEmailBody(html)).toBe("Safe"); + }); + + it("removes HTML comments", () => { + const html = "

Visible

"; + expect(sanitizeEmailBody(html)).toBe("Visible"); + }); + + // --- Newline conversion --- + + it("converts
to newlines", () => { + expect(sanitizeEmailBody("Line 1
Line 2
Line 3")).toBe("Line 1\nLine 2\nLine 3"); + }); + + it("converts block-level closing tags to newlines", () => { + const html = "
Block 1
Block 2
"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Block 1"); + expect(result).toContain("Block 2"); + expect(result).toMatch(/Block 1\n+Block 2/); + }); + + // --- HTML entities --- + + it("decodes named HTML entities", () => { + expect(sanitizeEmailBody("& < > "  ")).toBe('& < > "'); + }); + + it("decodes numeric decimal entities", () => { + expect(sanitizeEmailBody("ABC")).toBe("ABC"); + }); + + it("decodes numeric hex entities", () => { + expect(sanitizeEmailBody("ABC")).toBe("ABC"); + }); + + it("decodes typographic entities", () => { + expect(sanitizeEmailBody("“Hello” — world")).toBe( + "\u201CHello\u201D — world", + ); + }); + + // --- Data URIs / base64 --- + + it("removes base64 data URIs", () => { + const html = 'pic

Text

'; + expect(sanitizeEmailBody(html)).toBe("Text"); + }); + + it("removes inline base64 content", () => { + const html = "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP some text after"; + const result = sanitizeEmailBody(html); + expect(result).not.toContain("base64"); + expect(result).toContain("some text after"); + }); + + // --- Tracking pixels --- + + it("removes 1x1 tracking pixel images", () => { + const html = '

Content

'; + expect(sanitizeEmailBody(html)).toBe("Content"); + }); + + it("removes display:none images", () => { + const html = '

Content

'; + expect(sanitizeEmailBody(html)).toBe("Content"); + }); + + // --- Links --- + + it("keeps link text, drops tracking hrefs", () => { + const html = 'Click here'; + expect(sanitizeEmailBody(html)).toBe(""); + }); + + it("keeps useful link text", () => { + const html = 'Read more'; + expect(sanitizeEmailBody(html)).toBe("Read more"); + }); + + // --- Footer patterns --- + + it("removes unsubscribe text", () => { + const html = "

Real content

Unsubscribe from this mailing list

"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Real content"); + expect(result).not.toMatch(/unsubscribe/i); + }); + + it("removes 'sent from my iPhone'", () => { + const html = "

Hey!

Sent from my iPhone

"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Hey!"); + expect(result).not.toMatch(/sent from my iphone/i); + }); + + it("removes 'Get Outlook for iOS'", () => { + const html = "

Meeting at 3

Get Outlook for iOS

"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Meeting at 3"); + expect(result).not.toMatch(/get outlook/i); + }); + + it("removes confidentiality notices", () => { + const html = + "

Actual content

Confidentiality notice: This email is intended solely for the use of the individual to whom it is addressed.

"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Actual content"); + expect(result).not.toMatch(/confidentiality notice/i); + }); + + it("removes copyright notices", () => { + const html = "

Content

© 2024 Acme Corp. All rights reserved.

"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Content"); + expect(result).not.toMatch(/all rights reserved/i); + }); + + it("removes 'you are receiving this email because'", () => { + const html = + "

Newsletter

You are receiving this email because you signed up on our website.

"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Newsletter"); + expect(result).not.toMatch(/you are receiving/i); + }); + + it("removes privacy policy / terms of service", () => { + const html = "

Content

Privacy Policy | Terms of Service

"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Content"); + expect(result).not.toMatch(/privacy policy/i); + }); + + // --- Whitespace collapsing --- + + it("collapses excessive blank lines to max 2 newlines", () => { + const html = "

A

\n\n\n\n\n

B

"; + const result = sanitizeEmailBody(html); + expect(result).not.toMatch(/\n{3,}/); + expect(result).toContain("A"); + expect(result).toContain("B"); + }); + + it("trims each line and removes leading/trailing whitespace", () => { + const html = "

Hello

World

"; + const result = sanitizeEmailBody(html); + expect(result).not.toMatch(/^\s/); + expect(result).not.toMatch(/\s$/); + for (const line of result.split("\n")) { + expect(line).toBe(line.trim()); + } + }); + + it("collapses multiple spaces to single space", () => { + expect(sanitizeEmailBody("Hello world here")).toBe("Hello world here"); + }); + + // --- Real-world-ish email --- + + it("handles a typical marketing email", () => { + const html = ` + + + + + +
+ Logo +
+
+

Big Sale!

+

Save 50% on everything this weekend.

+

Use code: SAVE50

+ Shop Now +
+ + + + `; + const result = sanitizeEmailBody(html); + + // Should keep + expect(result).toContain("Big Sale!"); + expect(result).toContain("Save 50% on everything this weekend."); + expect(result).toContain("SAVE50"); + expect(result).toContain("Shop Now"); + + // Should remove + expect(result).not.toContain("<"); + expect(result).not.toContain("style"); + expect(result).not.toMatch(/tracker/); + expect(result).not.toMatch(/unsubscribe/i); + expect(result).not.toMatch(/all rights reserved/i); + expect(result).not.toMatch(/privacy policy/i); + }); +}); diff --git a/src/hooks/sanitize-email-body.ts b/src/hooks/sanitize-email-body.ts new file mode 100644 index 000000000..1bcea0c5a --- /dev/null +++ b/src/hooks/sanitize-email-body.ts @@ -0,0 +1,178 @@ +/** + * Sanitise raw HTML email bodies into clean plain text. + * + * Strips HTML tags, CSS, scripts, tracking pixels, email footers, + * base64 data URIs, and excessive whitespace so the body is compact + * and ready for LLM consumption. + */ + +// --------------------------------------------------------------------------- +// Footer / boilerplate patterns (case-insensitive) +// --------------------------------------------------------------------------- +const FOOTER_PATTERNS: RegExp[] = [ + // Unsubscribe / manage preferences + /unsubscribe\b.{0,200}/gi, + /manage\s+(?:your\s+)?(?:email\s+)?preferences?.{0,100}/gi, + /opt[\s-]?out\b.{0,100}/gi, + /email\s+preferences?.{0,80}/gi, + /update\s+(?:your\s+)?subscription.{0,80}/gi, + /you\s+(?:are\s+)?receiv(?:ed?|ing)\s+this\s+(?:email|message)\s+because.{0,300}/gi, + /this\s+(?:email|message)\s+was\s+sent\s+(?:to|by).{0,200}/gi, + /if\s+you\s+no\s+longer\s+wish\s+to\s+receive.{0,200}/gi, + /to\s+stop\s+receiving\s+these\s+(?:emails|notifications|messages).{0,200}/gi, + + // "Sent from" signatures + /sent\s+from\s+(?:my\s+)?(?:iphone|ipad|galaxy|android|samsung|pixel|outlook|mail).{0,60}/gi, + /get\s+outlook\s+for\s+(?:ios|android).{0,40}/gi, + + // Privacy / legal + /this\s+(?:email|message)\s+(?:and\s+any\s+attachments?\s+)?(?:is|are)\s+(?:intended\s+)?(?:solely\s+)?(?:for\s+the\s+use\s+of).{0,500}/gi, + /confidential(?:ity)?\s+notice.{0,400}/gi, + /disclaimer:.{0,400}/gi, + /©\s*\d{4}.{0,120}/gi, + /all\s+rights\s+reserved\.?/gi, + /privacy\s+policy/gi, + /terms\s+(?:of\s+(?:service|use)|and\s+conditions)/gi, +]; + +// --------------------------------------------------------------------------- +// Core sanitiser +// --------------------------------------------------------------------------- + +export function sanitizeEmailBody(html: string): string { + if (!html || typeof html !== "string") return ""; + + let text = html; + + // 1. Remove