import { parseHTML } from "linkedom";
import escapeHtml from "escape-html";
import type { AttachmentData } from "../types";

/** The document object produced by linkedom's `parseHTML`. */
type ParsedDocument = ReturnType<typeof parseHTML>["document"];

/**
 * Normalizes a Content-ID for lookup: trims whitespace and strips a leading
 * `<` and/or trailing `>`, so that a stored value like `"<ii_mpi85rqy0>"`
 * matches an HTML reference `cid:ii_mpi85rqy0`.
 *
 * @param cid - Raw Content-ID (header value or the part after `cid:`).
 * @returns The bare identifier, or `undefined` when the input is nullish,
 *   empty, or consists only of whitespace/brackets.
 */
export function normalizeCid(
  cid: string | null | undefined,
): string | undefined {
  if (!cid) return undefined;
  const trimmed = cid.trim().replace(/^<|>$/g, "").trim();
  // `||` on purpose: an empty string must collapse to undefined.
  return trimmed || undefined;
}

/**
 * Collects the normalized Content-IDs referenced by `cid:` sources in the
 * email body — the same `[src]` / `cid:` match that rewriteCidSrc applies.
 * Used at ingest to flag those attachments as inline (rendered in place,
 * hidden from the downloadable attachment lists).
 *
 * @param content - Email body (HTML or plain text).
 * @returns Set of normalized Content-IDs; empty for empty or plain-text input.
 */
export function extractInlineCids(content: string): Set<string> {
  const cids = new Set<string>();
  if (!content || isPlainText(content)) return cids;

  const { document } = parseHTML(content);
  document.querySelectorAll("[src]").forEach((el: Element) => {
    const match = (el.getAttribute("src") ?? "").match(/^\s*cid:(.+)$/i);
    const cid = match ? normalizeCid(match[1]) : undefined;
    if (cid) cids.add(cid);
  });
  return cids;
}

/**
 * Renders an HTML fragment (or an already-plain string) down to plain text:
 * tags are dropped, entities are decoded by the parser, and runs of
 * whitespace are collapsed to single spaces. Used for feed text fields
 * (presumably <title> and similar), which must be plain text — raw
 * markup/entities show literally in readers.
 *
 * @param value - HTML fragment or plain string.
 * @returns The collapsed, trimmed text content; `""` for empty input.
 */
export function htmlToText(value: string): string {
  if (!value) return "";
  const { document } = parseHTML(`<body>${value}</body>`);
  return (document.documentElement?.textContent ?? "")
    .replace(/\s+/g, " ")
    .trim();
}

// Collect the links from an email body for confirmation detection: anchor href +
// visible text from HTML, or a regex URL sweep for plain-text bodies. Infra owns
// the DOM parse; the domain detector receives plain tuples.
export function extractLinks(
  content: string,
): { href: string; text: string }[] {
  if (!content) return [];
  if (isPlainText(content)) return sweepPlainTextUrls(content);

  const { document } = parseHTML(content);
  const links: { href: string; text: string }[] = [];
  document.querySelectorAll("a[href]").forEach((el: Element) => {
    const href = (el.getAttribute("href") ?? "").trim();
    if (!href) return;
    links.push({
      href,
      // Visible anchor text with whitespace collapsed. The cast is kept from
      // the original code; presumably the Element type in scope does not
      // expose textContent — confirm before simplifying.
      text: ((el as unknown as { textContent?: string }).textContent ?? "")
        .replace(/\s+/g, " ")
        .trim(),
    });
  });
  return links;
}

/**
 * Regex sweep for http(s) URLs in a plain-text body. Each hit is returned as
 * both href and text. Trailing sentence punctuation (`.` `,` `;` `:` `!` `?`)
 * is trimmed so "…/confirm?t=abc." does not yield a broken link.
 */
function sweepPlainTextUrls(content: string): { href: string; text: string }[] {
  const links: { href: string; text: string }[] = [];
  for (const raw of content.match(/https?:\/\/[^\s<>"')]+/gi) ?? []) {
    const url = raw.replace(/[.,;:!?]+$/, "");
    // Skip matches that were nothing but a scheme plus punctuation.
    if (!/^https?:\/\/./i.test(url)) continue;
    links.push({ href: url, text: url });
  }
  return links;
}

/**
 * Collects a newsletter's self-advertised feed declarations from
 * `<link rel="alternate" type="…">` tags. Returns raw href+type tuples; the
 * domain decides which MIME types count as a feed. Relative hrefs are
 * absolutized against the sender base (best-effort); only http(s) URLs
 * survive. Plain-text bodies have no `<link>`, so they yield `[]`.
 *
 * @param content - Email body (HTML or plain text).
 * @param base - Sender base URL used to resolve relative hrefs; with the
 *   default `""`, relative hrefs cannot be resolved and are dropped.
 * @returns One `{ href, type }` entry per qualifying `<link>`, in document order.
 */
export function extractFeedLinks(
  content: string,
  base = "",
): { href: string; type: string }[] {
  if (!content || isPlainText(content)) return [];

  const { document } = parseHTML(content);
  const links: { href: string; type: string }[] = [];
  document
    .querySelectorAll('link[rel~="alternate"][type]')
    .forEach((el: Element) => {
      const type = (el.getAttribute("type") ?? "").trim();
      const rawHref = (el.getAttribute("href") ?? "").trim();
      if (!type || !rawHref) return;
      // toAbsolute() skips already-absolute hrefs (returns null), so keep those as-is.
      const href = /^https?:\/\//i.test(rawHref)
        ? rawHref
        : (toAbsolute(rawHref, base) ?? "");
      // Drops unresolved hrefs and anything that resolved to a non-http(s) scheme.
      if (!/^https?:\/\//i.test(href)) return;
      links.push({ href, type });
    });
  return links;
}

// Newsletters frequently defer images via data-src/loading="lazy"; readers don't
// run the lazy-loader, so the image renders blank. Promote the real source.
function promoteLazyImages(document: ParsedDocument): void {
  document.querySelectorAll("img").forEach((img: Element) => {
    // `||` on purpose: an empty attribute value falls through to the next candidate.
    const lazySrc =
      img.getAttribute("data-src") ||
      img.getAttribute("data-original") ||
      img.getAttribute("data-lazy-src");
    if (lazySrc) {
      const current = (img.getAttribute("src") ?? "").trim();
      // Only replace a missing src or a data: placeholder; a real src wins.
      if (!current || /^data:/i.test(current)) {
        img.setAttribute("src", lazySrc);
      }
    }

    const lazySrcset = img.getAttribute("data-srcset");
    if (lazySrcset && !img.getAttribute("srcset")) {
      img.setAttribute("srcset", lazySrcset);
    }

    img.removeAttribute("loading");
  });
}

/**
 * Resolves a single URL against the sender base.
 *
 * @param value - Raw attribute value.
 * @param base - Base URL to resolve against.
 * @returns The absolute URL, or `null` for values that are empty, already
 *   absolute (http/https), never to be rewritten (mailto:, tel:, data:, cid:,
 *   `#` anchors), or not resolvable against `base` (including an empty base).
 */
function toAbsolute(value: string, base: string): string | null {
  const v = value.trim();
  if (!v || /^(https?:|mailto:|tel:|data:|cid:|#)/i.test(v)) return null;
  try {
    return new URL(v, base).href;
  } catch {
    // Best-effort: an unparsable value or base leaves the attribute untouched.
    return null;
  }
}

/**
 * Most readers ignore xml:base, so relative href/src in content break.
 * Absolutizes `a[href]`, `area[href]` and `img[src]` against the sender's
 * site (best-effort). Protocol-relative `//host/x` values are resolved too
 * (they pick up the base's scheme). No-op when `base` is empty.
 */
function absolutizeUrls(document: ParsedDocument, base: string): void {
  if (!base) return;
  document.querySelectorAll("a[href], area[href]").forEach((el: Element) => {
    const abs = toAbsolute(el.getAttribute("href") ?? "", base);
    if (abs) el.setAttribute("href", abs);
  });
  document.querySelectorAll("img[src]").forEach((el: Element) => {
    const abs = toAbsolute(el.getAttribute("src") ?? "", base);
    if (abs) el.setAttribute("src", abs);
  });
}

/**
 * Drops `mso-*` declarations (Office HTML noise) from an inline style string
 * and re-joins the remaining declarations with "; ".
 *
 * NOTE(review): the split on ";" is naive — a ";" inside a value (e.g. a
 * data: URI in url(...)) is treated as a declaration boundary.
 */
function cleanMsoStyles(style: string): string {
  return style
    .split(";")
    .map((p) => p.trim())
    .filter((p) => p && !/^mso-/i.test(p))
    .join("; ");
}

/** Heuristic: content counts as plain text when it contains nothing tag-like (`<x…>`). */
function isPlainText(content: string): boolean {
  return !/<[a-z][\s\S]*>/i.test(content);
}

/**
 * Rewrites a `cid:` src on `el` to the stored attachment URL
 * `${baseUrl}/files/<id>/<encoded filename>`. Leaves the element untouched
 * when src is not a cid: reference or no attachment matches the Content-ID.
 */
function rewriteCidSrc(
  el: Element,
  cidMap: Map<string, AttachmentData>,
  baseUrl: string,
): void {
  const src = el.getAttribute("src") ?? "";
  const match = src.match(/^\s*cid:(.+)$/i);
  if (!match) return;
  const attachment = cidMap.get(normalizeCid(match[1]) ?? "");
  if (!attachment) return;
  el.setAttribute(
    "src",
    `${baseUrl}/files/${attachment.id}/${encodeURIComponent(attachment.filename)}`,
  );
}

/** URL-bearing attributes checked for a `javascript:` scheme. */
const URL_ATTRIBUTES = new Set([
  "href",
  "src",
  "action",
  "formaction",
  "xlink:href",
]);

/**
 * True when `value` would be read by a browser as a `javascript:` URL.
 * URL parsers drop ASCII tab/CR/LF anywhere in the input and ignore leading
 * control characters and spaces, so those are removed before matching —
 * otherwise "java\tscript:" or "\x01javascript:" would slip through.
 */
function hasJavascriptScheme(value: string): boolean {
  const normalized = value
    .replace(/[\t\n\r]/g, "")
    .replace(/^[\s\u0000-\u0020]+/, "");
  return /^javascript:/i.test(normalized);
}

/**
 * Sanitizes one element in place: removes `on*` event-handler attributes,
 * removes URL attributes carrying a `javascript:` scheme, and strips `mso-*`
 * declarations from the inline style (dropping the attribute if nothing is left).
 */
function sanitizeElement(el: Element): void {
  // Snapshot attribute names before mutating (linkedom attributes is array-like)
  const attrs = Array.from(
    el.attributes as unknown as ArrayLike<{ name: string }>,
  ).map((a) => a.name);

  for (const attr of attrs) {
    // Remove event handlers (onclick, onerror, onload, …)
    if (/^on/i.test(attr)) {
      el.removeAttribute(attr);
      continue;
    }
    // Remove javascript: URLs
    if (
      URL_ATTRIBUTES.has(attr.toLowerCase()) &&
      hasJavascriptScheme(el.getAttribute(attr) ?? "")
    ) {
      el.removeAttribute(attr);
    }
  }

  // Strip mso-* inline style properties (Office HTML noise)
  const style = el.getAttribute("style");
  if (style !== null) {
    const cleaned = cleanMsoStyles(style);
    if (cleaned) {
      el.setAttribute("style", cleaned);
    } else {
      el.removeAttribute("style");
    }
  }
}

/**
 * Processes email content for safe display in feeds and entry pages:
 * - Detects plain text and wraps it, HTML-escaped, in a <pre> block
 * - Removes dangerous elements: <script>, <object>, <embed>, <iframe>,
 *   <frame>, <frameset>
 * - Promotes lazy-loaded images (data-src → src, strips the loading attribute).
 *   This runs before sanitization so promoted values are checked as well.
 * - Removes event handler attributes and javascript: URLs
 * - Strips mso-* inline style properties (Office HTML)
 * - Absolutizes relative href/src against senderBaseUrl (the sender's site,
 *   best-effort) so links/images don't break in readers that ignore xml:base.
 * - Rewrites inline cid: image refs to the stored attachment URL. baseUrl=""
 *   yields relative URLs (entry page, same origin); a baseUrl yields absolute
 *   URLs (feeds, for external RSS readers).
 * - Returns the <body> contents for full documents, or the serialized
 *   document for bodyless fragments.
 *
 * @param content - Raw email body (HTML or plain text).
 * @param attachments - Attachments whose contentId may be referenced via cid:.
 * @param baseUrl - Prefix for rewritten attachment URLs.
 * @param senderBaseUrl - Base for resolving relative links; "" disables it.
 * @returns Display-ready HTML, or "" for empty input.
 */
export function processEmailContent(
  content: string,
  attachments?: AttachmentData[],
  baseUrl = "",
  senderBaseUrl = "",
): string {
  if (!content) return "";

  if (isPlainText(content)) {
    return `<pre style="white-space: pre-wrap; word-break: break-word;">${escapeHtml(content)}</pre>`;
  }

  const cidMap = new Map<string, AttachmentData>();
  for (const att of attachments ?? []) {
    const cid = normalizeCid(att.contentId);
    if (cid) cidMap.set(cid, att);
  }

  const { document } = parseHTML(content);

  document
    .querySelectorAll("script, object, embed, iframe, frame, frameset")
    .forEach((el: Element) => el.remove());

  // Promote lazy sources first so the sanitizer below also sees the values
  // copied from data-src and friends into src/srcset.
  promoteLazyImages(document);

  document.querySelectorAll("*").forEach((el: Element) => sanitizeElement(el));

  // Absolutize first: cid: refs are skipped here (not http(s)), then rewritten
  // below to our /files/ URL — which must NOT be absolutized to the sender.
  absolutizeUrls(document, senderBaseUrl);

  if (cidMap.size > 0) {
    document
      .querySelectorAll("[src]")
      .forEach((el: Element) => rewriteCidSrc(el, cidMap, baseUrl));
  }

  // Full documents expose a <body>; bodyless fragments are serialized directly
  // so that sanitization and cid rewriting still apply to their nodes.
  const body = document.querySelector("body");
  return body ? body.innerHTML : document.toString();
}