diff --git a/src/config/types.hooks.ts b/src/config/types.hooks.ts index 7ca74605a..ee9e23b83 100644 --- a/src/config/types.hooks.ts +++ b/src/config/types.hooks.ts @@ -67,6 +67,8 @@ export type HooksGmailConfig = { model?: string; /** Optional thinking level override for Gmail hook processing. */ thinking?: "off" | "minimal" | "low" | "medium" | "high"; + /** Sanitise raw HTML email bodies to plain text before injecting into sessions (default: true). */ + sanitizeBody?: boolean; }; export type InternalHookHandlerConfig = { diff --git a/src/config/zod-schema.hooks.ts b/src/config/zod-schema.hooks.ts index 35e74f7af..32eee4b2b 100644 --- a/src/config/zod-schema.hooks.ts +++ b/src/config/zod-schema.hooks.ts @@ -125,6 +125,7 @@ export const HooksGmailSchema = z z.literal("high"), ]) .optional(), + sanitizeBody: z.boolean().optional(), }) .strict() .optional(); diff --git a/src/gateway/hooks-mapping.test.ts b/src/gateway/hooks-mapping.test.ts index 8900ffd07..ece4fcfeb 100644 --- a/src/gateway/hooks-mapping.test.ts +++ b/src/gateway/hooks-mapping.test.ts @@ -153,6 +153,62 @@ describe("hooks mapping", () => { } }); + it("sanitises email body HTML for gmail preset by default", async () => { + const mappings = resolveHookMappings({ presets: ["gmail"] }); + const result = await applyHookMappings(mappings, { + payload: { + messages: [ + { + id: "msg1", + from: "test@example.com", + subject: "Test", + snippet: "Preview", + body: "

Hello world

Unsubscribe from our list

", + }, + ], + }, + headers: {}, + url: baseUrl, + path: "gmail", + }); + expect(result?.ok).toBe(true); + if (result?.ok && result.action?.kind === "agent") { + expect(result.action.message).toContain("Hello world"); + expect(result.action.message).not.toContain("<"); + expect(result.action.message).not.toContain("style"); + expect(result.action.message).not.toMatch(/unsubscribe/i); + } + }); + + it("skips sanitisation when sanitizeBody is false", async () => { + const mappings = resolveHookMappings({ + presets: ["gmail"], + gmail: { sanitizeBody: false }, + }); + const result = await applyHookMappings(mappings, { + payload: { + messages: [ + { + id: "msg1", + from: "test@example.com", + subject: "Test", + snippet: "Preview", + body: "

Hello world

", + }, + ], + }, + headers: {}, + url: baseUrl, + path: "gmail", + }); + expect(result?.ok).toBe(true); + if (result?.ok && result.action?.kind === "agent") { + // Body should still contain HTML tags + expect(result.action.message).toContain("

"); + expect(result.action.message).toContain(""); + } + }); + it("rejects missing message", async () => { const mappings = resolveHookMappings({ mappings: [{ match: { path: "noop" }, action: "agent" }], diff --git a/src/gateway/hooks-mapping.ts b/src/gateway/hooks-mapping.ts index 2ebf9b136..8dc9d2c68 100644 --- a/src/gateway/hooks-mapping.ts +++ b/src/gateway/hooks-mapping.ts @@ -2,6 +2,7 @@ import path from "node:path"; import { pathToFileURL } from "node:url"; import { CONFIG_PATH, type HookMappingConfig, type HooksConfig } from "../config/config.js"; +import { sanitizeEmailBody } from "../hooks/sanitize-email-body.js"; import type { HookMessageChannel } from "./hooks.js"; export type HookMappingResolved = { @@ -22,6 +23,8 @@ export type HookMappingResolved = { thinking?: string; timeoutSeconds?: number; transform?: HookMappingTransformResolved; + /** When true, sanitise HTML email bodies in the payload before template rendering. */ + sanitizeBody?: boolean; }; export type HookMappingTransformResolved = { @@ -103,16 +106,21 @@ type HookTransformFn = ( export function resolveHookMappings(hooks?: HooksConfig): HookMappingResolved[] { const presets = hooks?.presets ?? []; const gmailAllowUnsafe = hooks?.gmail?.allowUnsafeExternalContent; + // sanitizeBody defaults to true unless explicitly set to false + const gmailSanitizeBody = hooks?.gmail?.sanitizeBody !== false; const mappings: HookMappingConfig[] = []; if (hooks?.mappings) mappings.push(...hooks.mappings); for (const preset of presets) { const presetMappings = hookPresetMappings[preset]; if (!presetMappings) continue; - if (preset === "gmail" && typeof gmailAllowUnsafe === "boolean") { + if (preset === "gmail") { mappings.push( ...presetMappings.map((mapping) => ({ ...mapping, - allowUnsafeExternalContent: gmailAllowUnsafe, + ...(typeof gmailAllowUnsafe === "boolean" + ? 
{ allowUnsafeExternalContent: gmailAllowUnsafe } + : {}), + _sanitizeBody: gmailSanitizeBody, })), ); continue; @@ -137,7 +145,10 @@ export async function applyHookMappings( for (const mapping of mappings) { if (!mappingMatches(mapping, ctx)) continue; - const base = buildActionFromMapping(mapping, ctx); + // Sanitise email bodies on a copy of the context (ctx itself is not mutated) before template rendering + const effectiveCtx = mapping.sanitizeBody ? sanitizePayloadBodies(ctx) : ctx; + + const base = buildActionFromMapping(mapping, effectiveCtx); if (!base.ok) return base; let override: HookTransformResult = null; @@ -174,6 +185,11 @@ function normalizeHookMapping( } : undefined; + const sanitizeBody = + typeof (mapping as Record)._sanitizeBody === "boolean" + ? ((mapping as Record)._sanitizeBody as boolean) + : undefined; + return { id, matchPath, @@ -192,6 +208,7 @@ function normalizeHookMapping( thinking: mapping.thinking, timeoutSeconds: mapping.timeoutSeconds, transform, + sanitizeBody, }; } @@ -359,6 +376,25 @@ function resolveTemplateExpr(expr: string, ctx: HookMappingContext) { return getByPath(ctx.payload, expr); } +/** + * Return a shallow-cloned context whose `messages[].body` fields have been + * sanitised from raw HTML to clean plain text. The original context is not + * mutated. 
+ */ +function sanitizePayloadBodies(ctx: HookMappingContext): HookMappingContext { + const messages = ctx.payload.messages; + if (!Array.isArray(messages) || messages.length === 0) return ctx; + + const cleaned = messages.map((msg: unknown) => { + if (msg === null || typeof msg !== "object") return msg; + const rec = msg as Record; + if (typeof rec.body !== "string") return msg; + return { ...rec, body: sanitizeEmailBody(rec.body) }; + }); + + return { ...ctx, payload: { ...ctx.payload, messages: cleaned } }; +} + function getByPath(input: Record, pathExpr: string): unknown { if (!pathExpr) return undefined; const parts: Array = []; diff --git a/src/hooks/sanitize-email-body.test.ts b/src/hooks/sanitize-email-body.test.ts new file mode 100644 index 000000000..c80ce5339 --- /dev/null +++ b/src/hooks/sanitize-email-body.test.ts @@ -0,0 +1,230 @@ +import { describe, expect, it } from "vitest"; +import { sanitizeEmailBody } from "./sanitize-email-body.js"; + +describe("sanitizeEmailBody", () => { + it("returns empty string for falsy input", () => { + expect(sanitizeEmailBody("")).toBe(""); + expect(sanitizeEmailBody(null as unknown as string)).toBe(""); + expect(sanitizeEmailBody(undefined as unknown as string)).toBe(""); + }); + + it("passes through plain text unchanged", () => { + expect(sanitizeEmailBody("Hello world")).toBe("Hello world"); + }); + + // --- HTML stripping --- + + it("strips basic HTML tags", () => { + expect(sanitizeEmailBody("

Hello world

")).toBe("Hello world"); + }); + + it("removes style blocks and their contents", () => { + const html = '

Content

'; + expect(sanitizeEmailBody(html)).toBe("Content"); + }); + + it("removes script blocks", () => { + const html = "

Safe

"; + expect(sanitizeEmailBody(html)).toBe("Safe"); + }); + + it("removes HTML comments", () => { + const html = "

Visible

"; + expect(sanitizeEmailBody(html)).toBe("Visible"); + }); + + // --- Newline conversion --- + + it("converts
to newlines", () => { + expect(sanitizeEmailBody("Line 1
Line 2
Line 3")).toBe("Line 1\nLine 2\nLine 3"); + }); + + it("converts block-level closing tags to newlines", () => { + const html = "
Block 1
Block 2
"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Block 1"); + expect(result).toContain("Block 2"); + expect(result).toMatch(/Block 1\n+Block 2/); + }); + + // --- HTML entities --- + + it("decodes named HTML entities", () => { + expect(sanitizeEmailBody("& < > "  ")).toBe('& < > "'); + }); + + it("decodes numeric decimal entities", () => { + expect(sanitizeEmailBody("ABC")).toBe("ABC"); + }); + + it("decodes numeric hex entities", () => { + expect(sanitizeEmailBody("ABC")).toBe("ABC"); + }); + + it("decodes typographic entities", () => { + expect(sanitizeEmailBody("“Hello” — world")).toBe( + "\u201CHello\u201D — world", + ); + }); + + // --- Data URIs / base64 --- + + it("removes base64 data URIs", () => { + const html = 'pic

Text

'; + expect(sanitizeEmailBody(html)).toBe("Text"); + }); + + it("removes inline base64 content", () => { + const html = "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP some text after"; + const result = sanitizeEmailBody(html); + expect(result).not.toContain("base64"); + expect(result).toContain("some text after"); + }); + + // --- Tracking pixels --- + + it("removes 1x1 tracking pixel images", () => { + const html = '

Content

'; + expect(sanitizeEmailBody(html)).toBe("Content"); + }); + + it("removes display:none images", () => { + const html = '

Content

'; + expect(sanitizeEmailBody(html)).toBe("Content"); + }); + + // --- Links --- + + it("keeps link text, drops tracking hrefs", () => { + const html = 'Click here'; + expect(sanitizeEmailBody(html)).toBe(""); + }); + + it("keeps useful link text", () => { + const html = 'Read more'; + expect(sanitizeEmailBody(html)).toBe("Read more"); + }); + + // --- Footer patterns --- + + it("removes unsubscribe text", () => { + const html = "

Real content

Unsubscribe from this mailing list

"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Real content"); + expect(result).not.toMatch(/unsubscribe/i); + }); + + it("removes 'sent from my iPhone'", () => { + const html = "

Hey!

Sent from my iPhone

"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Hey!"); + expect(result).not.toMatch(/sent from my iphone/i); + }); + + it("removes 'Get Outlook for iOS'", () => { + const html = "

Meeting at 3

Get Outlook for iOS

"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Meeting at 3"); + expect(result).not.toMatch(/get outlook/i); + }); + + it("removes confidentiality notices", () => { + const html = + "

Actual content

Confidentiality notice: This email is intended solely for the use of the individual to whom it is addressed.

"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Actual content"); + expect(result).not.toMatch(/confidentiality notice/i); + }); + + it("removes copyright notices", () => { + const html = "

Content

© 2024 Acme Corp. All rights reserved.

"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Content"); + expect(result).not.toMatch(/all rights reserved/i); + }); + + it("removes 'you are receiving this email because'", () => { + const html = + "

Newsletter

You are receiving this email because you signed up on our website.

"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Newsletter"); + expect(result).not.toMatch(/you are receiving/i); + }); + + it("removes privacy policy / terms of service", () => { + const html = "

Content

Privacy Policy | Terms of Service

"; + const result = sanitizeEmailBody(html); + expect(result).toContain("Content"); + expect(result).not.toMatch(/privacy policy/i); + }); + + // --- Whitespace collapsing --- + + it("collapses excessive blank lines to max 2 newlines", () => { + const html = "

A

\n\n\n\n\n

B

"; + const result = sanitizeEmailBody(html); + expect(result).not.toMatch(/\n{3,}/); + expect(result).toContain("A"); + expect(result).toContain("B"); + }); + + it("trims each line and removes leading/trailing whitespace", () => { + const html = "

Hello

World

"; + const result = sanitizeEmailBody(html); + expect(result).not.toMatch(/^\s/); + expect(result).not.toMatch(/\s$/); + for (const line of result.split("\n")) { + expect(line).toBe(line.trim()); + } + }); + + it("collapses multiple spaces to single space", () => { + expect(sanitizeEmailBody("Hello world here")).toBe("Hello world here"); + }); + + // --- Real-world-ish email --- + + it("handles a typical marketing email", () => { + const html = ` + + + + + +
+ Logo +
+
+

Big Sale!

+

Save 50% on everything this weekend.

+

Use code: SAVE50

+ Shop Now +
+ + + + `; + const result = sanitizeEmailBody(html); + + // Should keep + expect(result).toContain("Big Sale!"); + expect(result).toContain("Save 50% on everything this weekend."); + expect(result).toContain("SAVE50"); + expect(result).toContain("Shop Now"); + + // Should remove + expect(result).not.toContain("<"); + expect(result).not.toContain("style"); + expect(result).not.toMatch(/tracker/); + expect(result).not.toMatch(/unsubscribe/i); + expect(result).not.toMatch(/all rights reserved/i); + expect(result).not.toMatch(/privacy policy/i); + }); +}); diff --git a/src/hooks/sanitize-email-body.ts b/src/hooks/sanitize-email-body.ts new file mode 100644 index 000000000..1bcea0c5a --- /dev/null +++ b/src/hooks/sanitize-email-body.ts @@ -0,0 +1,178 @@ +/** + * Sanitise raw HTML email bodies into clean plain text. + * + * Strips HTML tags, CSS, scripts, tracking pixels, email footers, + * base64 data URIs, and excessive whitespace so the body is compact + * and ready for LLM consumption. + */ + +// --------------------------------------------------------------------------- +// Footer / boilerplate patterns (case-insensitive) +// --------------------------------------------------------------------------- +const FOOTER_PATTERNS: RegExp[] = [ + // Unsubscribe / manage preferences + /unsubscribe\b.{0,200}/gi, + /manage\s+(?:your\s+)?(?:email\s+)?preferences?.{0,100}/gi, + /opt[\s-]?out\b.{0,100}/gi, + /email\s+preferences?.{0,80}/gi, + /update\s+(?:your\s+)?subscription.{0,80}/gi, + /you\s+(?:are\s+)?receiv(?:ed?|ing)\s+this\s+(?:email|message)\s+because.{0,300}/gi, + /this\s+(?:email|message)\s+was\s+sent\s+(?:to|by).{0,200}/gi, + /if\s+you\s+no\s+longer\s+wish\s+to\s+receive.{0,200}/gi, + /to\s+stop\s+receiving\s+these\s+(?:emails|notifications|messages).{0,200}/gi, + + // "Sent from" signatures + /sent\s+from\s+(?:my\s+)?(?:iphone|ipad|galaxy|android|samsung|pixel|outlook|mail).{0,60}/gi, + /get\s+outlook\s+for\s+(?:ios|android).{0,40}/gi, + + // Privacy / legal + 
/this\s+(?:email|message)\s+(?:and\s+any\s+attachments?\s+)?(?:is|are)\s+(?:intended\s+)?(?:solely\s+)?(?:for\s+the\s+use\s+of).{0,500}/gi, + /confidential(?:ity)?\s+notice.{0,400}/gi, + /disclaimer:.{0,400}/gi, + /©\s*\d{4}.{0,120}/gi, + /all\s+rights\s+reserved\.?/gi, + /privacy\s+policy/gi, + /terms\s+(?:of\s+(?:service|use)|and\s+conditions)/gi, +]; + +// --------------------------------------------------------------------------- +// Core sanitiser +// --------------------------------------------------------------------------- + +export function sanitizeEmailBody(html: string): string { + if (!html || typeof html !== "string") return ""; + + let text = html; + + // 1. Remove