feat(gmail): sanitise HTML email bodies before template injection

Strip raw HTML from email bodies in the Gmail hook pipeline so that
clean plain text reaches the agent session instead of bloated HTML
with CSS, tracking pixels, footers, and boilerplate.

What it does:
- Strips HTML tags, <style>/<script> blocks, and HTML comments
- Removes tracking pixels (1x1 images, display:none)
- Removes base64 data URIs and inline encoded images
- Removes common email footer patterns (unsubscribe, sent from iPhone,
  confidentiality notices, copyright, privacy policy, etc.)
- Decodes HTML entities to plain text
- Converts block-level tags and <br> to newlines
- Collapses excessive whitespace (max 2 consecutive newlines)

Config: hooks.gmail.sanitizeBody (boolean, default: true)
Set to false to get the raw HTML body as before.

Sanitisation runs before template rendering in the hook mapping
pipeline, so tokens are saved for all downstream consumers.
This commit is contained in:
Adam Holt 2026-01-29 23:59:40 +00:00
parent 4583f88626
commit 52c584c26d
6 changed files with 506 additions and 3 deletions

View File

@ -67,6 +67,8 @@ export type HooksGmailConfig = {
model?: string;
/** Optional thinking level override for Gmail hook processing. */
thinking?: "off" | "minimal" | "low" | "medium" | "high";
/** Sanitise raw HTML email bodies to plain text before injecting into sessions (default: true). */
sanitizeBody?: boolean;
};
export type InternalHookHandlerConfig = {

View File

@ -125,6 +125,7 @@ export const HooksGmailSchema = z
z.literal("high"),
])
.optional(),
sanitizeBody: z.boolean().optional(),
})
.strict()
.optional();

View File

@ -153,6 +153,62 @@ describe("hooks mapping", () => {
}
});
// With no explicit gmail config, the gmail preset must hand the agent a
// plain-text body: markup, CSS and unsubscribe boilerplate are stripped.
it("sanitises email body HTML for gmail preset by default", async () => {
  const mappings = resolveHookMappings({ presets: ["gmail"] });
  const result = await applyHookMappings(mappings, {
    payload: {
      messages: [
        {
          id: "msg1",
          from: "test@example.com",
          subject: "Test",
          snippet: "Preview",
          body: "<div><p>Hello <b>world</b></p><style>.x{color:red}</style><p>Unsubscribe from our list</p></div>",
        },
      ],
    },
    headers: {},
    url: baseUrl,
    path: "gmail",
  });
  expect(result?.ok).toBe(true);
  // Fail loudly rather than silently skipping every assertion below when the
  // mapping does not yield an agent action.
  if (!result?.ok || result.action?.kind !== "agent") {
    throw new Error("expected the gmail preset to produce an agent action");
  }
  const { message } = result.action;
  expect(message).toContain("Hello world");
  expect(message).not.toContain("<");
  expect(message).not.toContain("style");
  expect(message).not.toMatch(/unsubscribe/i);
});
// Opt-out path: hooks.gmail.sanitizeBody = false must leave the raw HTML
// body untouched in the rendered message.
it("skips sanitisation when sanitizeBody is false", async () => {
  const mappings = resolveHookMappings({
    presets: ["gmail"],
    gmail: { sanitizeBody: false },
  });
  const result = await applyHookMappings(mappings, {
    payload: {
      messages: [
        {
          id: "msg1",
          from: "test@example.com",
          subject: "Test",
          snippet: "Preview",
          body: "<p>Hello <b>world</b></p>",
        },
      ],
    },
    headers: {},
    url: baseUrl,
    path: "gmail",
  });
  expect(result?.ok).toBe(true);
  // Fail loudly rather than silently skipping every assertion below when the
  // mapping does not yield an agent action.
  if (!result?.ok || result.action?.kind !== "agent") {
    throw new Error("expected the gmail preset to produce an agent action");
  }
  // Body should still contain HTML tags
  expect(result.action.message).toContain("<p>");
  expect(result.action.message).toContain("<b>");
});
it("rejects missing message", async () => {
const mappings = resolveHookMappings({
mappings: [{ match: { path: "noop" }, action: "agent" }],

View File

@ -2,6 +2,7 @@ import path from "node:path";
import { pathToFileURL } from "node:url";
import { CONFIG_PATH, type HookMappingConfig, type HooksConfig } from "../config/config.js";
import { sanitizeEmailBody } from "../hooks/sanitize-email-body.js";
import type { HookMessageChannel } from "./hooks.js";
export type HookMappingResolved = {
@ -22,6 +23,8 @@ export type HookMappingResolved = {
thinking?: string;
timeoutSeconds?: number;
transform?: HookMappingTransformResolved;
/** When true, sanitise HTML email bodies in the payload before template rendering. */
sanitizeBody?: boolean;
};
export type HookMappingTransformResolved = {
@ -103,16 +106,21 @@ type HookTransformFn = (
export function resolveHookMappings(hooks?: HooksConfig): HookMappingResolved[] {
const presets = hooks?.presets ?? [];
const gmailAllowUnsafe = hooks?.gmail?.allowUnsafeExternalContent;
// sanitizeBody defaults to true unless explicitly set to false
const gmailSanitizeBody = hooks?.gmail?.sanitizeBody !== false;
const mappings: HookMappingConfig[] = [];
if (hooks?.mappings) mappings.push(...hooks.mappings);
for (const preset of presets) {
const presetMappings = hookPresetMappings[preset];
if (!presetMappings) continue;
if (preset === "gmail" && typeof gmailAllowUnsafe === "boolean") {
if (preset === "gmail") {
mappings.push(
...presetMappings.map((mapping) => ({
...mapping,
allowUnsafeExternalContent: gmailAllowUnsafe,
...(typeof gmailAllowUnsafe === "boolean"
? { allowUnsafeExternalContent: gmailAllowUnsafe }
: {}),
_sanitizeBody: gmailSanitizeBody,
})),
);
continue;
@ -137,7 +145,10 @@ export async function applyHookMappings(
for (const mapping of mappings) {
if (!mappingMatches(mapping, ctx)) continue;
const base = buildActionFromMapping(mapping, ctx);
// Sanitise email bodies on a cloned context (ctx itself is not mutated) before template rendering
const effectiveCtx = mapping.sanitizeBody ? sanitizePayloadBodies(ctx) : ctx;
const base = buildActionFromMapping(mapping, effectiveCtx);
if (!base.ok) return base;
let override: HookTransformResult = null;
@ -174,6 +185,11 @@ function normalizeHookMapping(
}
: undefined;
const sanitizeBody =
typeof (mapping as Record<string, unknown>)._sanitizeBody === "boolean"
? ((mapping as Record<string, unknown>)._sanitizeBody as boolean)
: undefined;
return {
id,
matchPath,
@ -192,6 +208,7 @@ function normalizeHookMapping(
thinking: mapping.thinking,
timeoutSeconds: mapping.timeoutSeconds,
transform,
sanitizeBody,
};
}
@ -359,6 +376,25 @@ function resolveTemplateExpr(expr: string, ctx: HookMappingContext) {
return getByPath(ctx.payload, expr);
}
/**
 * Build a copy of `ctx` in which every string `body` found on an entry of
 * `payload.messages` has been passed through `sanitizeEmailBody`.
 *
 * Nothing reachable from the input is modified: the context, the payload and
 * each rewritten message are fresh shallow copies. Entries that are not
 * objects, or whose `body` is not a string, are carried over as-is. When
 * `payload.messages` is not an array or is empty, `ctx` itself is returned.
 */
function sanitizePayloadBodies(ctx: HookMappingContext): HookMappingContext {
  const { payload } = ctx;
  const { messages } = payload;
  if (!Array.isArray(messages) || messages.length === 0) {
    return ctx;
  }

  const sanitiseEntry = (entry: unknown): unknown => {
    if (typeof entry !== "object" || entry === null) {
      return entry;
    }
    const fields = entry as Record<string, unknown>;
    const { body } = fields;
    return typeof body === "string" ? { ...fields, body: sanitizeEmailBody(body) } : entry;
  };

  return {
    ...ctx,
    payload: { ...payload, messages: messages.map((entry: unknown) => sanitiseEntry(entry)) },
  };
}
function getByPath(input: Record<string, unknown>, pathExpr: string): unknown {
if (!pathExpr) return undefined;
const parts: Array<string | number> = [];

View File

@ -0,0 +1,230 @@
import { describe, expect, it } from "vitest";
import { sanitizeEmailBody } from "./sanitize-email-body.js";
// Behavioural spec for sanitizeEmailBody: each section pins one stage of the
// HTML -> plain-text pipeline, followed by an end-to-end marketing email.
describe("sanitizeEmailBody", () => {
  it("returns empty string for falsy input", () => {
    expect(sanitizeEmailBody("")).toBe("");
    expect(sanitizeEmailBody(null as unknown as string)).toBe("");
    expect(sanitizeEmailBody(undefined as unknown as string)).toBe("");
  });

  it("passes through plain text unchanged", () => {
    expect(sanitizeEmailBody("Hello world")).toBe("Hello world");
  });

  // --- HTML stripping ---
  it("strips basic HTML tags", () => {
    expect(sanitizeEmailBody("<p>Hello <b>world</b></p>")).toBe("Hello world");
  });

  it("removes style blocks and their contents", () => {
    const html = '<style type="text/css">.foo { color: red; }</style><p>Content</p>';
    expect(sanitizeEmailBody(html)).toBe("Content");
  });

  it("removes script blocks", () => {
    const html = "<script>alert('xss')</script><p>Safe</p>";
    expect(sanitizeEmailBody(html)).toBe("Safe");
  });

  it("removes HTML comments", () => {
    const html = "<!-- comment --><p>Visible</p>";
    expect(sanitizeEmailBody(html)).toBe("Visible");
  });

  // --- Newline conversion ---
  it("converts <br> to newlines", () => {
    expect(sanitizeEmailBody("Line 1<br>Line 2<br/>Line 3")).toBe("Line 1\nLine 2\nLine 3");
  });

  it("converts block-level closing tags to newlines", () => {
    const html = "<div>Block 1</div><div>Block 2</div>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Block 1");
    expect(result).toContain("Block 2");
    expect(result).toMatch(/Block 1\n+Block 2/);
  });

  // --- HTML entities ---
  it("decodes named HTML entities", () => {
    // The trailing &nbsp; becomes a space and is then trimmed away.
    expect(sanitizeEmailBody("&amp; &lt; &gt; &quot; &nbsp;")).toBe('& < > "');
  });

  it("decodes numeric decimal entities", () => {
    expect(sanitizeEmailBody("&#65;&#66;&#67;")).toBe("ABC");
  });

  it("decodes numeric hex entities", () => {
    expect(sanitizeEmailBody("&#x41;&#x42;&#x43;")).toBe("ABC");
  });

  it("decodes typographic entities", () => {
    expect(sanitizeEmailBody("&ldquo;Hello&rdquo; &mdash; world")).toBe(
      "\u201CHello\u201D — world",
    );
  });

  // --- Data URIs / base64 ---
  it("removes base64 data URIs", () => {
    const html = '<img src="data:image/png;base64,iVBORw0KGgo=" alt="pic"><p>Text</p>';
    expect(sanitizeEmailBody(html)).toBe("Text");
  });

  it("removes inline base64 content", () => {
    const html = "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP some text after";
    const result = sanitizeEmailBody(html);
    expect(result).not.toContain("base64");
    expect(result).toContain("some text after");
  });

  // --- Tracking pixels ---
  it("removes 1x1 tracking pixel images", () => {
    const html = '<img width="1" height="1" src="https://tracker.com/pixel.gif"><p>Content</p>';
    expect(sanitizeEmailBody(html)).toBe("Content");
  });

  it("removes display:none images", () => {
    const html = '<img style="display:none" src="https://tracker.com/pixel.gif"><p>Content</p>';
    expect(sanitizeEmailBody(html)).toBe("Content");
  });

  // --- Links ---
  // The old title ("keeps link text, drops tracking hrefs") contradicted the
  // assertion: an unsubscribe link is removed together with its text.
  it("drops links whose href points at an unsubscribe page", () => {
    const html = '<a href="https://example.com/unsubscribe">Click here</a>';
    expect(sanitizeEmailBody(html)).toBe("");
  });

  it("keeps useful link text", () => {
    const html = '<a href="https://example.com/article">Read more</a>';
    expect(sanitizeEmailBody(html)).toBe("Read more");
  });

  // --- Footer patterns ---
  it("removes unsubscribe text", () => {
    const html = "<p>Real content</p><p>Unsubscribe from this mailing list</p>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Real content");
    expect(result).not.toMatch(/unsubscribe/i);
  });

  it("removes 'sent from my iPhone'", () => {
    const html = "<p>Hey!</p><p>Sent from my iPhone</p>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Hey!");
    expect(result).not.toMatch(/sent from my iphone/i);
  });

  it("removes 'Get Outlook for iOS'", () => {
    const html = "<p>Meeting at 3</p><p>Get Outlook for iOS</p>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Meeting at 3");
    expect(result).not.toMatch(/get outlook/i);
  });

  it("removes confidentiality notices", () => {
    const html =
      "<p>Actual content</p><p>Confidentiality notice: This email is intended solely for the use of the individual to whom it is addressed.</p>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Actual content");
    expect(result).not.toMatch(/confidentiality notice/i);
  });

  it("removes copyright notices", () => {
    const html = "<p>Content</p><p>© 2024 Acme Corp. All rights reserved.</p>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Content");
    expect(result).not.toMatch(/all rights reserved/i);
  });

  it("removes 'you are receiving this email because'", () => {
    const html =
      "<p>Newsletter</p><p>You are receiving this email because you signed up on our website.</p>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Newsletter");
    expect(result).not.toMatch(/you are receiving/i);
  });

  it("removes privacy policy / terms of service", () => {
    const html = "<p>Content</p><p>Privacy Policy | Terms of Service</p>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Content");
    expect(result).not.toMatch(/privacy policy/i);
  });

  // --- Whitespace collapsing ---
  it("collapses excessive blank lines to max 2 newlines", () => {
    const html = "<p>A</p>\n\n\n\n\n<p>B</p>";
    const result = sanitizeEmailBody(html);
    expect(result).not.toMatch(/\n{3,}/);
    expect(result).toContain("A");
    expect(result).toContain("B");
  });

  it("trims each line and removes leading/trailing whitespace", () => {
    const html = " <p> Hello </p> <p> World </p> ";
    const result = sanitizeEmailBody(html);
    expect(result).not.toMatch(/^\s/);
    expect(result).not.toMatch(/\s$/);
    for (const line of result.split("\n")) {
      expect(line).toBe(line.trim());
    }
  });

  it("collapses multiple spaces to single space", () => {
    // The input must actually contain runs of spaces/tabs, otherwise this
    // assertion would hold even with no collapsing at all.
    expect(sanitizeEmailBody("Hello    world \t  here")).toBe("Hello world here");
  });

  // --- Real-world-ish email ---
  it("handles a typical marketing email", () => {
    const html = `
<html>
<head>
<style>.header { background: blue; } .footer { font-size: 10px; }</style>
</head>
<body>
<div class="header">
<img src="https://cdn.example.com/logo.png" alt="Logo">
</div>
<div class="content">
<h1>Big Sale!</h1>
<p>Save 50% on everything this weekend.</p>
<p>Use code: <b>SAVE50</b></p>
<a href="https://shop.example.com">Shop Now</a>
</div>
<div class="footer">
<img width="1" height="1" src="https://tracker.example.com/open.gif">
<p>You are receiving this email because you subscribed to our newsletter.</p>
<p><a href="https://example.com/unsubscribe?id=123">Unsubscribe</a> |
<a href="https://example.com/manage-preferences">Manage Preferences</a></p>
<p>© 2024 Example Corp. All rights reserved.</p>
<p>Privacy Policy | Terms of Service</p>
</div>
</body>
</html>
`;
    const result = sanitizeEmailBody(html);
    // Should keep
    expect(result).toContain("Big Sale!");
    expect(result).toContain("Save 50% on everything this weekend.");
    expect(result).toContain("SAVE50");
    expect(result).toContain("Shop Now");
    // Should remove
    expect(result).not.toContain("<");
    expect(result).not.toContain("style");
    expect(result).not.toMatch(/tracker/);
    expect(result).not.toMatch(/unsubscribe/i);
    expect(result).not.toMatch(/all rights reserved/i);
    expect(result).not.toMatch(/privacy policy/i);
  });
});

View File

@ -0,0 +1,178 @@
/**
* Sanitise raw HTML email bodies into clean plain text.
*
* Strips HTML tags, CSS, scripts, tracking pixels, email footers,
* base64 data URIs, and excessive whitespace so the body is compact
* and ready for LLM consumption.
*/
// ---------------------------------------------------------------------------
// Footer / boilerplate patterns (case-insensitive)
// ---------------------------------------------------------------------------
/**
 * Boilerplate phrases that sanitizeEmailBody deletes from the decoded text.
 *
 * Each entry is applied with `text.replace(pattern, "")`. All patterns are
 * global and case-insensitive. A trailing `.{0,N}` also removes up to N
 * further characters on the same line (`.` does not match a newline here, as
 * no pattern uses the `s` flag), so the rest of a footer sentence goes too.
 *
 * NOTE(review): none of the patterns is anchored to a line start or to the end
 * of the message, so a phrase such as "unsubscribe" or "privacy policy" inside
 * genuine content is removed as well — confirm this trade-off is acceptable.
 */
const FOOTER_PATTERNS: RegExp[] = [
// Unsubscribe / manage preferences
/unsubscribe\b.{0,200}/gi,
/manage\s+(?:your\s+)?(?:email\s+)?preferences?.{0,100}/gi,
/opt[\s-]?out\b.{0,100}/gi,
/email\s+preferences?.{0,80}/gi,
/update\s+(?:your\s+)?subscription.{0,80}/gi,
// "Why you got this mail" explanations
/you\s+(?:are\s+)?receiv(?:ed?|ing)\s+this\s+(?:email|message)\s+because.{0,300}/gi,
/this\s+(?:email|message)\s+was\s+sent\s+(?:to|by).{0,200}/gi,
/if\s+you\s+no\s+longer\s+wish\s+to\s+receive.{0,200}/gi,
/to\s+stop\s+receiving\s+these\s+(?:emails|notifications|messages).{0,200}/gi,
// "Sent from" signatures
/sent\s+from\s+(?:my\s+)?(?:iphone|ipad|galaxy|android|samsung|pixel|outlook|mail).{0,60}/gi,
/get\s+outlook\s+for\s+(?:ios|android).{0,40}/gi,
// Privacy / legal
/this\s+(?:email|message)\s+(?:and\s+any\s+attachments?\s+)?(?:is|are)\s+(?:intended\s+)?(?:solely\s+)?(?:for\s+the\s+use\s+of).{0,500}/gi,
/confidential(?:ity)?\s+notice.{0,400}/gi,
/disclaimer:.{0,400}/gi,
// Copyright line: the sign, a four-digit year, and up to 120 following characters
/©\s*\d{4}.{0,120}/gi,
// The three patterns below have no trailing wildcard: only the phrase itself is removed
/all\s+rights\s+reserved\.?/gi,
/privacy\s+policy/gi,
/terms\s+(?:of\s+(?:service|use)|and\s+conditions)/gi,
];
// ---------------------------------------------------------------------------
// Core sanitiser
// ---------------------------------------------------------------------------
/**
 * One piece of markup that is safe to delete outright:
 *  - an opening/closing tag whose name starts with a letter and is followed by
 *    whitespace, `/` or `>` (so `<b>`, `</p >`, `<br/>`, `<a href="x">`), or
 *  - a declaration / processing instruction (`<!DOCTYPE …>`, `<?xml …?>`).
 *
 * A bare `<[^>]+>` is deliberately NOT used: it also deletes plain-text such as
 * `1 < 2 and 3 > 2` or `Jane <jane@example.com>`.
 */
const MARKUP_TAG = /<\/?[a-zA-Z][a-zA-Z0-9:-]*(?:\s[^>]*)?\/?>|<[!?][^>]*>/g;

/**
 * A base64 `data:` URI. The MIME type and optional `;key=value` parameters are
 * limited to token characters so a match cannot begin at an unrelated "data:"
 * in prose and swallow everything up to the next real data URI.
 */
const BASE64_DATA_URI =
  /data:[a-z0-9.+-]+\/[a-z0-9.+-]+(?:;[a-z0-9._=+-]+)*;base64,[A-Za-z0-9+/=]+/gi;

/**
 * Convert a raw (usually HTML) email body into compact plain text.
 *
 * Stages, in order: drop `<style>`/`<script>` blocks and comments, drop
 * tracking-pixel images and data URIs, turn `<br>` and block-level tags into
 * newlines, reduce links to their visible text (unsubscribe-style links are
 * dropped entirely), strip remaining tags, decode HTML entities, delete
 * FOOTER_PATTERNS boilerplate, and finally normalise whitespace.
 *
 * @param html Raw body. Any non-string or empty value yields `""`.
 * @returns Trimmed plain text with at most one blank line between paragraphs.
 */
export function sanitizeEmailBody(html: string): string {
  if (typeof html !== "string" || html === "") return "";
  let text = html;

  // 1. Remove <style> blocks (including contents)
  text = text.replace(/<style[^>]*>[\s\S]*?<\/style\s*>/gi, "");
  // 2. Remove <script> blocks
  text = text.replace(/<script[^>]*>[\s\S]*?<\/script\s*>/gi, "");
  // 3. Remove HTML comments
  text = text.replace(/<!--[\s\S]*?-->/g, "");

  // 4. Remove tracking pixels / invisible images (1x1, hidden, display:none)
  text = text.replace(
    /<img[^>]*(?:width\s*=\s*["']?1["']?|height\s*=\s*["']?1["']?|display\s*:\s*none)[^>]*\/?>/gi,
    "",
  );

  // 5. Remove all data URIs and base64 images
  text = text.replace(BASE64_DATA_URI, "");
  text = text.replace(/src\s*=\s*["']data:[^"']*["']/gi, "");

  // 6. Convert <br> and block-level tags to newlines
  text = text.replace(/<br\s*\/?>/gi, "\n");
  text = text.replace(
    /<\/(?:p|div|tr|li|h[1-6]|blockquote|section|article|header|footer|table|thead|tbody)>/gi,
    "\n",
  );
  // The lookahead requires the tag name to end here, so `<paul@example.com>`
  // or `<link …>` are not mistaken for `<p…>` / `<li…>`. `pre` is listed
  // explicitly because it used to match only through the missing boundary.
  text = text.replace(
    /<(?:p|div|tr|li|h[1-6]|blockquote|section|article|header|footer|hr|pre)(?=[\s/>])[^>]*>/gi,
    "\n",
  );

  // 7. Reduce <a href="...">text</a> to its visible text; the URL is discarded
  text = text.replace(
    /<a\s[^>]*href\s*=\s*["']([^"']*)["'][^>]*>([\s\S]*?)<\/a\s*>/gi,
    (_m, href: string, inner: string) => {
      // Unsubscribe / preference-centre links are dropped entirely. This is
      // checked first so it also applies when the link text is the URL itself.
      if (/unsubscribe|optout|opt-out|manage.preferences/i.test(href)) return "";
      return inner.replace(MARKUP_TAG, "").trim();
    },
  );

  // 8. Strip all remaining HTML tags (real tags only — see MARKUP_TAG)
  text = text.replace(MARKUP_TAG, "");

  // 9. Decode HTML entities (after tag stripping, so `&lt;b&gt;` stays literal text)
  text = decodeHtmlEntities(text);

  // 10. Remove footer / boilerplate patterns
  for (const pattern of FOOTER_PATTERNS) {
    text = text.replace(pattern, "");
  }

  // 11. Collapse whitespace
  // Replace tabs and multiple spaces (but not newlines) with single space
  text = text.replace(/[^\S\n]+/g, " ");
  // Trim each line
  text = text
    .split("\n")
    .map((line) => line.trim())
    .join("\n");
  // Collapse 3+ consecutive newlines to 2
  text = text.replace(/\n{3,}/g, "\n\n");

  // Trim leading/trailing whitespace
  return text.trim();
}
// ---------------------------------------------------------------------------
// HTML entity decoder (no dependency needed for common entities)
// ---------------------------------------------------------------------------
/**
 * Named entities the decoder understands, keyed by the full lower-case
 * `&name;` token. Non-ASCII replacements are written as escapes so they cannot
 * be lost to an encoding mishap (the en dash entry had degraded to "").
 */
const ENTITY_MAP: Record<string, string> = {
  "&amp;": "&",
  "&lt;": "<",
  "&gt;": ">",
  "&quot;": '"',
  // Never reached through the named-entity path (it contains digits); the
  // numeric path decodes it. Kept so the table stays a complete reference.
  "&#39;": "'",
  "&apos;": "'",
  // Non-breaking space is intentionally mapped to an ordinary space.
  "&nbsp;": " ",
  "&ndash;": "\u2013", // en dash
  "&mdash;": "\u2014", // em dash
  "&lsquo;": "\u2018",
  "&rsquo;": "\u2019",
  "&ldquo;": "\u201C",
  "&rdquo;": "\u201D",
  "&bull;": "\u2022",
  "&hellip;": "\u2026",
  "&copy;": "\u00A9",
  "&reg;": "\u00AE",
  "&trade;": "\u2122",
  "&euro;": "\u20AC",
  "&pound;": "\u00A3",
  "&yen;": "\u00A5",
  "&cent;": "\u00A2",
  "&deg;": "\u00B0",
  "&times;": "\u00D7",
  "&divide;": "\u00F7",
  "&para;": "\u00B6",
  "&sect;": "\u00A7",
  "&laquo;": "\u00AB",
  "&raquo;": "\u00BB",
};

/**
 * Turn a numeric character reference into its character, or return `fallback`
 * when the value is not a usable Unicode scalar (zero, NaN, beyond U+10FFFF,
 * or a surrogate, which would leave a lone surrogate in the string).
 */
function codePointOr(codePoint: number, fallback: string): string {
  const inRange = codePoint > 0 && codePoint < 0x110000;
  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  return inRange && !isSurrogate ? String.fromCodePoint(codePoint) : fallback;
}

/**
 * Decode HTML character references in `text`.
 *
 * Handles the names in ENTITY_MAP (matched case-insensitively), decimal
 * (`&#65;`) and hexadecimal (`&#x41;` / `&#X41;`) references. Unknown names and
 * out-of-range numbers are left exactly as written.
 *
 * Everything is decoded in a single pass, so output produced by one reference
 * is never re-scanned: `&amp;#65;` yields the literal `&#65;`, not `A`.
 *
 * @param text Text that may contain character references.
 * @returns The text with recognised references replaced.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|[a-zA-Z]+);/g,
    (match: string, decimal: string | undefined, hex: string | undefined): string => {
      if (decimal !== undefined) return codePointOr(Number.parseInt(decimal, 10), match);
      if (hex !== undefined) return codePointOr(Number.parseInt(hex, 16), match);
      return ENTITY_MAP[match.toLowerCase()] ?? match;
    },
  );
}