feat(gmail): sanitise HTML email bodies before template injection
Strip raw HTML from email bodies in the Gmail hook pipeline so that clean plain text reaches the agent session instead of bloated HTML with CSS, tracking pixels, footers, and boilerplate.

What it does:
- Strips HTML tags, <style>/<script> blocks, and HTML comments
- Removes tracking pixels (1x1 images, display:none)
- Removes base64 data URIs and inline encoded images
- Removes common email footer patterns (unsubscribe, sent from iPhone, confidentiality notices, copyright, privacy policy, etc.)
- Decodes HTML entities to plain text
- Converts block-level tags and <br> to newlines
- Collapses excessive whitespace (max 2 consecutive newlines)

Config: hooks.gmail.sanitizeBody (boolean, default: true)
Set to false to get the raw HTML body as before.

Sanitisation runs before template rendering in the hook mapping pipeline, so tokens are saved for all downstream consumers.
This commit is contained in:
parent
4583f88626
commit
52c584c26d
@ -67,6 +67,8 @@ export type HooksGmailConfig = {
|
||||
model?: string;
|
||||
/** Optional thinking level override for Gmail hook processing. */
|
||||
thinking?: "off" | "minimal" | "low" | "medium" | "high";
|
||||
/** Sanitise raw HTML email bodies to plain text before injecting into sessions (default: true). */
|
||||
sanitizeBody?: boolean;
|
||||
};
|
||||
|
||||
export type InternalHookHandlerConfig = {
|
||||
|
||||
@ -125,6 +125,7 @@ export const HooksGmailSchema = z
|
||||
z.literal("high"),
|
||||
])
|
||||
.optional(),
|
||||
sanitizeBody: z.boolean().optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional();
|
||||
|
||||
@ -153,6 +153,62 @@ describe("hooks mapping", () => {
|
||||
}
|
||||
});
|
||||
|
||||
// Verifies that the gmail preset sanitises HTML bodies when no explicit
// `sanitizeBody` setting is provided (the default is on).
it("sanitises email body HTML for gmail preset by default", async () => {
  const mappings = resolveHookMappings({ presets: ["gmail"] });
  const result = await applyHookMappings(mappings, {
    payload: {
      messages: [
        {
          id: "msg1",
          from: "test@example.com",
          subject: "Test",
          snippet: "Preview",
          body: "<div><p>Hello <b>world</b></p><style>.x{color:red}</style><p>Unsubscribe from our list</p></div>",
        },
      ],
    },
    headers: {},
    url: baseUrl,
    path: "gmail",
  });
  expect(result?.ok).toBe(true);
  // Fail loudly instead of silently skipping the assertions below: a guarded
  // `if (...) { expect(...) }` would let this test pass without checking anything
  // whenever the mapping stops producing an agent action.
  if (!result?.ok || result.action?.kind !== "agent") {
    throw new Error("expected the gmail preset mapping to produce an agent action");
  }
  expect(result.action.message).toContain("Hello world");
  expect(result.action.message).not.toContain("<");
  expect(result.action.message).not.toContain("style");
  expect(result.action.message).not.toMatch(/unsubscribe/i);
});
|
||||
|
||||
// Verifies the opt-out: with `hooks.gmail.sanitizeBody: false` the raw HTML body
// must reach the rendered message untouched.
it("skips sanitisation when sanitizeBody is false", async () => {
  const mappings = resolveHookMappings({
    presets: ["gmail"],
    gmail: { sanitizeBody: false },
  });
  const result = await applyHookMappings(mappings, {
    payload: {
      messages: [
        {
          id: "msg1",
          from: "test@example.com",
          subject: "Test",
          snippet: "Preview",
          body: "<p>Hello <b>world</b></p>",
        },
      ],
    },
    headers: {},
    url: baseUrl,
    path: "gmail",
  });
  expect(result?.ok).toBe(true);
  // Fail loudly rather than skipping the assertions when no agent action is
  // produced; otherwise the test would pass without verifying anything.
  if (!result?.ok || result.action?.kind !== "agent") {
    throw new Error("expected the gmail preset mapping to produce an agent action");
  }
  // Body should still contain HTML tags
  expect(result.action.message).toContain("<p>");
  expect(result.action.message).toContain("<b>");
});
|
||||
|
||||
it("rejects missing message", async () => {
|
||||
const mappings = resolveHookMappings({
|
||||
mappings: [{ match: { path: "noop" }, action: "agent" }],
|
||||
|
||||
@ -2,6 +2,7 @@ import path from "node:path";
|
||||
import { pathToFileURL } from "node:url";
|
||||
|
||||
import { CONFIG_PATH, type HookMappingConfig, type HooksConfig } from "../config/config.js";
|
||||
import { sanitizeEmailBody } from "../hooks/sanitize-email-body.js";
|
||||
import type { HookMessageChannel } from "./hooks.js";
|
||||
|
||||
export type HookMappingResolved = {
|
||||
@ -22,6 +23,8 @@ export type HookMappingResolved = {
|
||||
thinking?: string;
|
||||
timeoutSeconds?: number;
|
||||
transform?: HookMappingTransformResolved;
|
||||
/** When true, sanitise HTML email bodies in the payload before template rendering. */
|
||||
sanitizeBody?: boolean;
|
||||
};
|
||||
|
||||
export type HookMappingTransformResolved = {
|
||||
@ -103,16 +106,21 @@ type HookTransformFn = (
|
||||
export function resolveHookMappings(hooks?: HooksConfig): HookMappingResolved[] {
|
||||
const presets = hooks?.presets ?? [];
|
||||
const gmailAllowUnsafe = hooks?.gmail?.allowUnsafeExternalContent;
|
||||
// sanitizeBody defaults to true unless explicitly set to false
|
||||
const gmailSanitizeBody = hooks?.gmail?.sanitizeBody !== false;
|
||||
const mappings: HookMappingConfig[] = [];
|
||||
if (hooks?.mappings) mappings.push(...hooks.mappings);
|
||||
for (const preset of presets) {
|
||||
const presetMappings = hookPresetMappings[preset];
|
||||
if (!presetMappings) continue;
|
||||
if (preset === "gmail" && typeof gmailAllowUnsafe === "boolean") {
|
||||
if (preset === "gmail") {
|
||||
mappings.push(
|
||||
...presetMappings.map((mapping) => ({
|
||||
...mapping,
|
||||
allowUnsafeExternalContent: gmailAllowUnsafe,
|
||||
...(typeof gmailAllowUnsafe === "boolean"
|
||||
? { allowUnsafeExternalContent: gmailAllowUnsafe }
|
||||
: {}),
|
||||
_sanitizeBody: gmailSanitizeBody,
|
||||
})),
|
||||
);
|
||||
continue;
|
||||
@ -137,7 +145,10 @@ export async function applyHookMappings(
|
||||
for (const mapping of mappings) {
|
||||
if (!mappingMatches(mapping, ctx)) continue;
|
||||
|
||||
const base = buildActionFromMapping(mapping, ctx);
|
||||
// Sanitise email bodies on a copy of the context (the original is not mutated) before template rendering
|
||||
const effectiveCtx = mapping.sanitizeBody ? sanitizePayloadBodies(ctx) : ctx;
|
||||
|
||||
const base = buildActionFromMapping(mapping, effectiveCtx);
|
||||
if (!base.ok) return base;
|
||||
|
||||
let override: HookTransformResult = null;
|
||||
@ -174,6 +185,11 @@ function normalizeHookMapping(
|
||||
}
|
||||
: undefined;
|
||||
|
||||
const sanitizeBody =
|
||||
typeof (mapping as Record<string, unknown>)._sanitizeBody === "boolean"
|
||||
? ((mapping as Record<string, unknown>)._sanitizeBody as boolean)
|
||||
: undefined;
|
||||
|
||||
return {
|
||||
id,
|
||||
matchPath,
|
||||
@ -192,6 +208,7 @@ function normalizeHookMapping(
|
||||
thinking: mapping.thinking,
|
||||
timeoutSeconds: mapping.timeoutSeconds,
|
||||
transform,
|
||||
sanitizeBody,
|
||||
};
|
||||
}
|
||||
|
||||
@ -359,6 +376,25 @@ function resolveTemplateExpr(expr: string, ctx: HookMappingContext) {
|
||||
return getByPath(ctx.payload, expr);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a shallow-cloned context whose `messages[].body` fields have been
|
||||
* sanitised from raw HTML to clean plain text. The original context is not
|
||||
* mutated.
|
||||
*/
|
||||
function sanitizePayloadBodies(ctx: HookMappingContext): HookMappingContext {
  const { messages } = ctx.payload;
  // Nothing to clean: hand back the very same context object.
  if (!Array.isArray(messages) || messages.length === 0) return ctx;

  // Produces a cleaned copy of one message; anything that is not an object
  // carrying a string `body` is passed through untouched.
  const cleanMessage = (entry: unknown): unknown => {
    const isObject = entry !== null && typeof entry === "object";
    if (!isObject) return entry;
    const fields = entry as Record<string, unknown>;
    return typeof fields.body === "string"
      ? { ...fields, body: sanitizeEmailBody(fields.body) }
      : entry;
  };

  // Copy-on-write at every level we touch so the caller's context, payload and
  // message objects are left as they were.
  return {
    ...ctx,
    payload: {
      ...ctx.payload,
      messages: messages.map((entry: unknown) => cleanMessage(entry)),
    },
  };
}
|
||||
|
||||
function getByPath(input: Record<string, unknown>, pathExpr: string): unknown {
|
||||
if (!pathExpr) return undefined;
|
||||
const parts: Array<string | number> = [];
|
||||
|
||||
230
src/hooks/sanitize-email-body.test.ts
Normal file
230
src/hooks/sanitize-email-body.test.ts
Normal file
@ -0,0 +1,230 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { sanitizeEmailBody } from "./sanitize-email-body.js";
|
||||
|
||||
// Unit tests for sanitizeEmailBody. Entity-bearing inputs are written with the
// escaped HTML entity syntax (e.g. "&amp;") so the decoder is actually exercised.
describe("sanitizeEmailBody", () => {
  it("returns empty string for falsy input", () => {
    expect(sanitizeEmailBody("")).toBe("");
    expect(sanitizeEmailBody(null as unknown as string)).toBe("");
    expect(sanitizeEmailBody(undefined as unknown as string)).toBe("");
  });

  it("passes through plain text unchanged", () => {
    expect(sanitizeEmailBody("Hello world")).toBe("Hello world");
  });

  // --- HTML stripping ---

  it("strips basic HTML tags", () => {
    expect(sanitizeEmailBody("<p>Hello <b>world</b></p>")).toBe("Hello world");
  });

  it("removes style blocks and their contents", () => {
    const html = '<style type="text/css">.foo { color: red; }</style><p>Content</p>';
    expect(sanitizeEmailBody(html)).toBe("Content");
  });

  it("removes script blocks", () => {
    const html = "<script>alert('xss')</script><p>Safe</p>";
    expect(sanitizeEmailBody(html)).toBe("Safe");
  });

  it("removes HTML comments", () => {
    const html = "<!-- comment --><p>Visible</p>";
    expect(sanitizeEmailBody(html)).toBe("Visible");
  });

  // --- Newline conversion ---

  it("converts <br> to newlines", () => {
    expect(sanitizeEmailBody("Line 1<br>Line 2<br/>Line 3")).toBe("Line 1\nLine 2\nLine 3");
  });

  it("converts block-level closing tags to newlines", () => {
    const html = "<div>Block 1</div><div>Block 2</div>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Block 1");
    expect(result).toContain("Block 2");
    expect(result).toMatch(/Block 1\n+Block 2/);
  });

  // --- HTML entities ---

  it("decodes named HTML entities", () => {
    // The trailing &nbsp; decodes to whitespace and is removed by the final trim.
    expect(sanitizeEmailBody("&amp; &lt; &gt; &quot; &nbsp;")).toBe('& < > "');
  });

  it("decodes numeric decimal entities", () => {
    expect(sanitizeEmailBody("&#65;&#66;&#67;")).toBe("ABC");
  });

  it("decodes numeric hex entities", () => {
    expect(sanitizeEmailBody("&#x41;&#x42;&#x43;")).toBe("ABC");
  });

  it("decodes typographic entities", () => {
    expect(sanitizeEmailBody("&ldquo;Hello&rdquo; &mdash; world")).toBe(
      "\u201CHello\u201D \u2014 world",
    );
  });

  // --- Data URIs / base64 ---

  it("removes base64 data URIs", () => {
    const html = '<img src="data:image/png;base64,iVBORw0KGgo=" alt="pic"><p>Text</p>';
    expect(sanitizeEmailBody(html)).toBe("Text");
  });

  it("removes inline base64 content", () => {
    const html = "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP some text after";
    const result = sanitizeEmailBody(html);
    expect(result).not.toContain("base64");
    expect(result).toContain("some text after");
  });

  // --- Tracking pixels ---

  it("removes 1x1 tracking pixel images", () => {
    const html = '<img width="1" height="1" src="https://tracker.com/pixel.gif"><p>Content</p>';
    expect(sanitizeEmailBody(html)).toBe("Content");
  });

  it("removes display:none images", () => {
    const html = '<img style="display:none" src="https://tracker.com/pixel.gif"><p>Content</p>';
    expect(sanitizeEmailBody(html)).toBe("Content");
  });

  // --- Links ---

  it("drops unsubscribe links entirely (text and href)", () => {
    const html = '<a href="https://example.com/unsubscribe">Click here</a>';
    expect(sanitizeEmailBody(html)).toBe("");
  });

  it("keeps useful link text", () => {
    const html = '<a href="https://example.com/article">Read more</a>';
    expect(sanitizeEmailBody(html)).toBe("Read more");
  });

  // --- Footer patterns ---

  it("removes unsubscribe text", () => {
    const html = "<p>Real content</p><p>Unsubscribe from this mailing list</p>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Real content");
    expect(result).not.toMatch(/unsubscribe/i);
  });

  it("removes 'sent from my iPhone'", () => {
    const html = "<p>Hey!</p><p>Sent from my iPhone</p>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Hey!");
    expect(result).not.toMatch(/sent from my iphone/i);
  });

  it("removes 'Get Outlook for iOS'", () => {
    const html = "<p>Meeting at 3</p><p>Get Outlook for iOS</p>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Meeting at 3");
    expect(result).not.toMatch(/get outlook/i);
  });

  it("removes confidentiality notices", () => {
    const html =
      "<p>Actual content</p><p>Confidentiality notice: This email is intended solely for the use of the individual to whom it is addressed.</p>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Actual content");
    expect(result).not.toMatch(/confidentiality notice/i);
  });

  it("removes copyright notices", () => {
    const html = "<p>Content</p><p>&copy; 2024 Acme Corp. All rights reserved.</p>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Content");
    expect(result).not.toMatch(/all rights reserved/i);
  });

  it("removes 'you are receiving this email because'", () => {
    const html =
      "<p>Newsletter</p><p>You are receiving this email because you signed up on our website.</p>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Newsletter");
    expect(result).not.toMatch(/you are receiving/i);
  });

  it("removes privacy policy / terms of service", () => {
    const html = "<p>Content</p><p>Privacy Policy | Terms of Service</p>";
    const result = sanitizeEmailBody(html);
    expect(result).toContain("Content");
    expect(result).not.toMatch(/privacy policy/i);
  });

  // --- Whitespace collapsing ---

  it("collapses excessive blank lines to max 2 newlines", () => {
    const html = "<p>A</p>\n\n\n\n\n<p>B</p>";
    const result = sanitizeEmailBody(html);
    expect(result).not.toMatch(/\n{3,}/);
    expect(result).toContain("A");
    expect(result).toContain("B");
  });

  it("trims each line and removes leading/trailing whitespace", () => {
    const html = " <p> Hello </p> <p> World </p> ";
    const result = sanitizeEmailBody(html);
    expect(result).not.toMatch(/^\s/);
    expect(result).not.toMatch(/\s$/);
    for (const line of result.split("\n")) {
      expect(line).toBe(line.trim());
    }
  });

  it("collapses multiple spaces to single space", () => {
    // The input must contain runs of spaces for this test to mean anything.
    expect(sanitizeEmailBody("Hello    world   here")).toBe("Hello world here");
  });

  // --- Real-world-ish email ---

  it("handles a typical marketing email", () => {
    const html = `
      <html>
      <head>
        <style>.header { background: blue; } .footer { font-size: 10px; }</style>
      </head>
      <body>
        <div class="header">
          <img src="https://cdn.example.com/logo.png" alt="Logo">
        </div>
        <div class="content">
          <h1>Big Sale!</h1>
          <p>Save 50% on everything this weekend.</p>
          <p>Use code: <b>SAVE50</b></p>
          <a href="https://shop.example.com">Shop Now</a>
        </div>
        <div class="footer">
          <img width="1" height="1" src="https://tracker.example.com/open.gif">
          <p>You are receiving this email because you subscribed to our newsletter.</p>
          <p><a href="https://example.com/unsubscribe?id=123">Unsubscribe</a> |
          <a href="https://example.com/manage-preferences">Manage Preferences</a></p>
          <p>&copy; 2024 Example Corp. All rights reserved.</p>
          <p>Privacy Policy | Terms of Service</p>
        </div>
      </body>
      </html>
    `;
    const result = sanitizeEmailBody(html);

    // Should keep
    expect(result).toContain("Big Sale!");
    expect(result).toContain("Save 50% on everything this weekend.");
    expect(result).toContain("SAVE50");
    expect(result).toContain("Shop Now");

    // Should remove
    expect(result).not.toContain("<");
    expect(result).not.toContain("style");
    expect(result).not.toMatch(/tracker/);
    expect(result).not.toMatch(/unsubscribe/i);
    expect(result).not.toMatch(/all rights reserved/i);
    expect(result).not.toMatch(/privacy policy/i);
  });
});
|
||||
178
src/hooks/sanitize-email-body.ts
Normal file
178
src/hooks/sanitize-email-body.ts
Normal file
@ -0,0 +1,178 @@
|
||||
/**
|
||||
* Sanitise raw HTML email bodies into clean plain text.
|
||||
*
|
||||
* Strips HTML tags, CSS, scripts, tracking pixels, email footers,
|
||||
* base64 data URIs, and excessive whitespace so the body is compact
|
||||
* and ready for LLM consumption.
|
||||
*/
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Footer / boilerplate patterns (case-insensitive)
|
||||
// ---------------------------------------------------------------------------
|
||||
/**
 * Boilerplate phrases that are deleted from the plain-text body.
 *
 * Every pattern is global and case-insensitive. Because `.` does not match a
 * newline, the bounded `.{0,N}` tails can only consume the remainder of the
 * line on which the phrase starts (up to N characters).
 *
 * Patterns whose first word is a common word *ending* ("sent", "opt", "get")
 * are anchored with a leading `\b` so they cannot fire in the middle of a word:
 * without it, "consent from Samsung ..." would be cut down to "con".
 */
const FOOTER_PATTERNS: RegExp[] = [
  // Unsubscribe / manage preferences
  /unsubscribe\b.{0,200}/gi,
  /manage\s+(?:your\s+)?(?:email\s+)?preferences?.{0,100}/gi,
  /\bopt[\s-]?out\b.{0,100}/gi,
  /email\s+preferences?.{0,80}/gi,
  /update\s+(?:your\s+)?subscription.{0,80}/gi,
  /you\s+(?:are\s+)?receiv(?:ed?|ing)\s+this\s+(?:email|message)\s+because.{0,300}/gi,
  /this\s+(?:email|message)\s+was\s+sent\s+(?:to|by).{0,200}/gi,
  /if\s+you\s+no\s+longer\s+wish\s+to\s+receive.{0,200}/gi,
  /to\s+stop\s+receiving\s+these\s+(?:emails|notifications|messages).{0,200}/gi,

  // "Sent from" signatures
  /\bsent\s+from\s+(?:my\s+)?(?:iphone|ipad|galaxy|android|samsung|pixel|outlook|mail).{0,60}/gi,
  /\bget\s+outlook\s+for\s+(?:ios|android).{0,40}/gi,

  // Privacy / legal
  /this\s+(?:email|message)\s+(?:and\s+any\s+attachments?\s+)?(?:is|are)\s+(?:intended\s+)?(?:solely\s+)?(?:for\s+the\s+use\s+of).{0,500}/gi,
  /confidential(?:ity)?\s+notice.{0,400}/gi,
  /disclaimer:.{0,400}/gi,
  /©\s*\d{4}.{0,120}/gi,
  /all\s+rights\s+reserved\.?/gi,
  /privacy\s+policy/gi,
  /terms\s+(?:of\s+(?:service|use)|and\s+conditions)/gi,
];
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Core sanitiser
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Convert a raw HTML email body into compact plain text.
 *
 * The steps run in a fixed order: markup that carries no readable content
 * (style/script blocks, comments, tracking images, data URIs) is removed first,
 * structural tags become newlines or spaces, anchors are reduced to their text,
 * remaining tags are stripped, entities are decoded, footer boilerplate is
 * removed, and finally whitespace is normalised.
 *
 * Tags are stripped *before* entities are decoded, so escaped markup such as
 * `&lt;b&gt;` ends up as literal text rather than being treated as a tag.
 *
 * @param html - Raw email body; plain text is accepted and passes through the
 *   same pipeline.
 * @returns The cleaned text, or `""` when `html` is empty or not a string.
 */
export function sanitizeEmailBody(html: string): string {
  if (!html || typeof html !== "string") return "";

  let text = html;

  // 1. Remove <style> blocks (including contents)
  text = text.replace(/<style[^>]*>[\s\S]*?<\/style\s*>/gi, "");

  // 2. Remove <script> blocks
  text = text.replace(/<script[^>]*>[\s\S]*?<\/script\s*>/gi, "");

  // 3. Remove HTML comments
  text = text.replace(/<!--[\s\S]*?-->/g, "");

  // 4. Remove tracking pixels / invisible images (1x1, hidden, display:none)
  text = text.replace(
    /<img[^>]*(?:width\s*=\s*["']?1["']?|height\s*=\s*["']?1["']?|display\s*:\s*none)[^>]*\/?>/gi,
    "",
  );

  // 5. Remove all data URIs and base64 images
  text = text.replace(/data:[^;]+;base64,[A-Za-z0-9+/=]+/g, "");
  text = text.replace(/src\s*=\s*["']data:[^"']*["']/gi, "");

  // 6. Convert <br> and block-level tags to newlines
  text = text.replace(/<br\s*\/?>/gi, "\n");
  // Table cells are not block-level, but without a separator the text of
  // adjacent cells would fuse ("Name</td><td>Price" -> "NamePrice").
  text = text.replace(/<\/t[dh]\s*>/gi, " ");
  text = text.replace(
    /<\/(?:p|div|tr|li|h[1-6]|blockquote|section|article|header|footer|table|thead|tbody)>/gi,
    "\n",
  );
  text = text.replace(
    /<(?:p|div|tr|li|h[1-6]|blockquote|section|article|header|footer|hr)[^>]*>/gi,
    "\n",
  );

  // 7. Reduce <a href="...">text</a> to its text; the URL itself is dropped.
  text = text.replace(
    /<a\s[^>]*href\s*=\s*["']([^"']*)["'][^>]*>([\s\S]*?)<\/a\s*>/gi,
    (_match: string, href: string, inner: string) => {
      // Unsubscribe / preference links are boilerplate: drop them entirely.
      // This is checked first so that a link whose visible text is the URL
      // itself is removed too, instead of leaving a truncated URL behind once
      // the footer patterns run.
      if (/unsubscribe|optout|opt-out|manage.preferences/i.test(href)) return "";
      return inner.replace(/<[^>]*>/g, "").trim();
    },
  );

  // 8. Strip all remaining HTML tags
  text = text.replace(/<[^>]+>/g, "");

  // 9. Decode HTML entities
  text = decodeHtmlEntities(text);

  // 10. Remove footer / boilerplate patterns
  for (const pattern of FOOTER_PATTERNS) {
    text = text.replace(pattern, "");
  }

  // 11. Collapse whitespace
  // Replace tabs and multiple spaces (but not newlines) with single space
  text = text.replace(/[^\S\n]+/g, " ");
  // Trim each line
  text = text
    .split("\n")
    .map((line) => line.trim())
    .join("\n");
  // Collapse 3+ consecutive newlines to 2
  text = text.replace(/\n{3,}/g, "\n\n");

  // Trim leading/trailing whitespace
  return text.trim();
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HTML entity decoder (no dependency needed for common entities)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Named entities understood by the decoder, keyed by the complete lower-case
 * entity text (leading `&` and trailing `;` included). Numeric references such
 * as `&#39;` are not listed here; they are decoded arithmetically.
 */
const ENTITY_MAP: Record<string, string> = {
  "&amp;": "&",
  "&lt;": "<",
  "&gt;": ">",
  "&quot;": '"',
  "&apos;": "'",
  "&nbsp;": " ",
  "&ndash;": "–",
  "&mdash;": "—",
  "&lsquo;": "\u2018",
  "&rsquo;": "\u2019",
  "&ldquo;": "\u201C",
  "&rdquo;": "\u201D",
  "&bull;": "•",
  "&hellip;": "…",
  "&copy;": "©",
  "&reg;": "®",
  "&trade;": "™",
  "&euro;": "€",
  "&pound;": "£",
  "&yen;": "¥",
  "&cent;": "¢",
  "&deg;": "°",
  "&times;": "×",
  "&divide;": "÷",
  "&para;": "¶",
  "&sect;": "§",
  "&laquo;": "«",
  "&raquo;": "»",
};

/**
 * Matches one entity reference: decimal (`&#123;`, capture 1), hexadecimal
 * (`&#x1F4A9;` or `&#X1F4A9;`, capture 2) or named (`&amp;`, no capture).
 */
const ENTITY_PATTERN = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|[a-zA-Z]+);/g;

/**
 * Decode HTML entity references in `text`.
 *
 * All three reference forms are handled in a single pass, so the output of one
 * decoding is never decoded again: `&amp;#65;` yields the literal text
 * `&#65;`, not `A`.
 *
 * @param text - Text that may contain entity references.
 * @returns The text with known named entities and in-range numeric references
 *   replaced. Unknown names and out-of-range code points are left unchanged.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(ENTITY_PATTERN, (entity: string, dec?: string, hex?: string) => {
    let codePoint: number | undefined;
    if (dec !== undefined) {
      codePoint = Number.parseInt(dec, 10);
    } else if (hex !== undefined) {
      codePoint = Number.parseInt(hex, 16);
    }

    if (codePoint === undefined) {
      // Named entity: lookup is case-insensitive; unknown names pass through.
      return ENTITY_MAP[entity.toLowerCase()] ?? entity;
    }

    // Only U+0001..U+10FFFF can be produced; anything else is kept verbatim.
    return codePoint > 0 && codePoint < 0x110000 ? String.fromCodePoint(codePoint) : entity;
  });
}
|
||||
Loading…
Reference in New Issue
Block a user