Files
kill-the-news/src/infrastructure/html-processor.ts
T

266 lines
9.4 KiB
TypeScript

import { parseHTML } from "linkedom";
import escapeHtml from "escape-html";
import type { AttachmentData } from "../types";
// The `document` object handed back by linkedom's parseHTML, derived from the
// function's return type. Used as the parameter type of the DOM helpers below.
type ParsedDocument = ReturnType<typeof parseHTML>["document"];
// Canonicalize a Content-ID so that the stored header form ("<ii_mpi85rqy0>")
// and the HTML reference form ("cid:ii_mpi85rqy0") compare equal: outer
// whitespace is dropped, then one leading "<" and one trailing ">", then any
// whitespace that was inside the brackets. Returns undefined for null,
// undefined, or a value that is empty once stripped.
export function normalizeCid(
  cid: string | null | undefined,
): string | undefined {
  if (!cid) return undefined;

  let core = cid.trim();
  if (core.startsWith("<")) core = core.slice(1);
  if (core.endsWith(">")) core = core.slice(0, -1);
  core = core.trim();

  return core === "" ? undefined : core;
}
// Gather the normalized Content-IDs that the email body references through
// `cid:` values in any `src` attribute. The pattern and normalization match the
// ones rewriteCidSrc applies, so this yields the ids that function would look
// up. Used at ingest to flag those attachments as inline (rendered in place,
// hidden from the downloadable attachment lists).
// Empty or plain-text content has no such references and yields an empty set.
export function extractInlineCids(content: string): Set<string> {
  const found = new Set<string>();
  if (!content || isPlainText(content)) return found;

  const { document } = parseHTML(content);
  document.querySelectorAll("[src]").forEach((node: Element) => {
    const src = node.getAttribute("src") ?? "";
    const hit = /^\s*cid:(.+)$/i.exec(src);
    if (!hit) return;
    const id = normalizeCid(hit[1]);
    if (id) found.add(id);
  });
  return found;
}
// Flatten an HTML fragment (or a string that is already plain) to plain text.
// The value is wrapped in a <body>, parsed, and the document's text content is
// taken, so tags disappear and entities are decoded; whitespace runs collapse
// to a single space and the result is trimmed. Used for feed <title>s, which
// must be plain text because readers show raw markup/entities literally.
export function htmlToText(value: string): string {
  if (!value) return "";

  const wrapped = `<body>${value}</body>`;
  const root = parseHTML(wrapped).document.documentElement;
  const raw = root?.textContent ?? "";
  return raw.replace(/\s+/g, " ").trim();
}
// Collect the links from an email body for confirmation detection.
//
// - HTML bodies: every <a href> contributes its trimmed href plus its visible
//   text (whitespace collapsed). Anchors with an empty href are skipped.
// - Plain-text bodies: a regex sweep for http(s) URLs; each URL is used as both
//   href and text. Trailing sentence punctuation (. , ; : ! ?) is trimmed,
//   because in prose such as "confirm here: https://x.test/c?t=abc." the final
//   character belongs to the sentence, not the link, and keeping it would
//   corrupt the URL (e.g. the token).
//
// Infra owns the DOM parse; the domain detector receives plain tuples.
// Returns [] for empty content.
export function extractLinks(
  content: string,
): { href: string; text: string }[] {
  if (!content) return [];

  if (isPlainText(content)) {
    // Manual backwards scan rather than /[.,;:!?]+$/ so that a long run of
    // punctuation in the middle of a URL cannot trigger regex backtracking.
    const stripTrailingPunctuation = (url: string): string => {
      let end = url.length;
      while (end > 0 && ".,;:!?".includes(url.charAt(end - 1))) end--;
      return url.slice(0, end);
    };

    const urls = content.match(/https?:\/\/[^\s<>"')]+/gi) ?? [];
    return urls.map((raw) => {
      const url = stripTrailingPunctuation(raw);
      return { href: url, text: url };
    });
  }

  const { document } = parseHTML(content);
  const links: { href: string; text: string }[] = [];
  document.querySelectorAll("a[href]").forEach((el: Element) => {
    const href = (el.getAttribute("href") ?? "").trim();
    if (!href) return;
    links.push({
      href,
      // The cast reads textContent without relying on it being declared on the
      // `Element` type in scope.
      text: ((el as unknown as { textContent?: string }).textContent ?? "")
        .replace(/\s+/g, " ")
        .trim(),
    });
  });
  return links;
}
// Find the feeds a newsletter advertises about itself via
// <link rel="alternate" type="…" href="…"> tags and return them as raw
// href+type tuples; deciding which MIME types count as a feed is left to the
// domain layer.
//
// An href that already starts with http(s):// is kept verbatim. Anything else
// is resolved against `base` (the sender base, best-effort) and dropped when
// resolution fails or the result is not an http(s) URL. Tags with an empty
// type or href are ignored. Empty or plain-text content yields [].
export function extractFeedLinks(
  content: string,
  base = "",
): { href: string; type: string }[] {
  if (!content || isPlainText(content)) return [];

  const isHttp = (url: string): boolean => /^https?:\/\//i.test(url);
  const found: { href: string; type: string }[] = [];

  const { document } = parseHTML(content);
  document
    .querySelectorAll('link[rel~="alternate"][type]')
    .forEach((link: Element) => {
      const type = (link.getAttribute("type") ?? "").trim();
      const declared = (link.getAttribute("href") ?? "").trim();
      if (!type || !declared) return;

      let href = declared;
      if (!isHttp(declared)) {
        // toAbsolute() returns null for hrefs it refuses to rewrite, which is
        // why already-absolute http(s) values are handled before calling it.
        const resolved = toAbsolute(declared, base) ?? "";
        if (!isHttp(resolved)) return;
        href = resolved;
      }
      found.push({ href, type });
    });
  return found;
}
// Newsletters often defer images through data-* attributes and a JS lazy
// loader; feed readers never run that loader, so the image would stay blank.
// For every <img>:
//   - the first non-empty of data-src / data-original / data-lazy-src replaces
//     `src`, but only when `src` is empty or a data: placeholder;
//   - data-srcset is copied to `srcset` when no srcset is set;
//   - the `loading` attribute is removed.
// Mutates the document in place.
function promoteLazyImages(document: ParsedDocument): void {
  const lazySourceAttrs = ["data-src", "data-original", "data-lazy-src"];

  document.querySelectorAll("img").forEach((img: Element) => {
    let deferredSrc: string | null = null;
    for (const name of lazySourceAttrs) {
      const candidate = img.getAttribute(name);
      if (candidate) {
        deferredSrc = candidate;
        break;
      }
    }

    if (deferredSrc) {
      const existing = (img.getAttribute("src") ?? "").trim();
      const isPlaceholder = existing === "" || /^data:/i.test(existing);
      if (isPlaceholder) img.setAttribute("src", deferredSrc);
    }

    const deferredSrcset = img.getAttribute("data-srcset");
    if (deferredSrcset) {
      const hasSrcset = Boolean(img.getAttribute("srcset"));
      if (!hasSrcset) img.setAttribute("srcset", deferredSrcset);
    }

    img.removeAttribute("loading");
  });
}
// Resolve one URL reference against `base` and return the absolute form.
// Returns null when there is nothing to rewrite: the value is blank, it starts
// with http(s):, mailto:, tel:, data:, cid: or "#", or URL construction throws
// (for example because `base` is empty or not a valid URL).
function toAbsolute(value: string, base: string): string | null {
  const candidate = value.trim();
  if (candidate === "") return null;

  const leaveAlone = /^(https?:|mailto:|tel:|data:|cid:|#)/i;
  if (leaveAlone.test(candidate)) return null;

  let resolved: string | null;
  try {
    resolved = new URL(candidate, base).href;
  } catch {
    resolved = null;
  }
  return resolved;
}
// Most readers ignore xml:base, so relative href/src values in content break.
// Rewrite them in place to absolute URLs resolved against `base` (the sender's
// site, best-effort). Covers href on <a>/<area> and src on <img>. Values that
// toAbsolute() declines to rewrite are left untouched; protocol-relative
// //host/x references are resolved and pick up the base's scheme.
// Does nothing when `base` is empty.
function absolutizeUrls(document: ParsedDocument, base: string): void {
  if (!base) return;

  const targets: ReadonlyArray<readonly [string, string]> = [
    ["a[href], area[href]", "href"],
    ["img[src]", "src"],
  ];

  for (const [selector, attribute] of targets) {
    document.querySelectorAll(selector).forEach((el: Element) => {
      const resolved = toAbsolute(el.getAttribute(attribute) ?? "", base);
      if (resolved) el.setAttribute(attribute, resolved);
    });
  }
}
// Remove Microsoft Office `mso-*` declarations from an inline style string and
// return the remaining declarations joined with "; " (empty string when nothing
// is left).
//
// Declarations are split only at top-level semicolons: a ";" inside a quoted
// string or inside parentheses (e.g. url("data:image/png;base64,…")) belongs to
// the value and must not be treated as a separator, otherwise the value is cut
// in two and re-joined with an injected space. Backslash escapes are honored so
// an escaped quote does not end a string. Unbalanced quotes/parentheses simply
// make the remainder one declaration (best-effort, never throws).
function cleanMsoStyles(style: string): string {
  const declarations: string[] = [];
  let current = "";
  let quote = ""; // the open quote character, or "" when outside a string
  let depth = 0; // nesting level of "(" … ")"

  for (let i = 0; i < style.length; i++) {
    const ch = style.charAt(i);

    if (ch === "\\" && i + 1 < style.length) {
      // Copy the escape pair verbatim so the escaped character is never
      // interpreted as a quote, parenthesis or separator.
      current += ch + style.charAt(i + 1);
      i++;
      continue;
    }

    if (quote !== "") {
      if (ch === quote) quote = "";
      current += ch;
      continue;
    }

    if (ch === '"' || ch === "'") {
      quote = ch;
    } else if (ch === "(") {
      depth++;
    } else if (ch === ")") {
      depth = Math.max(0, depth - 1);
    } else if (ch === ";" && depth === 0) {
      declarations.push(current);
      current = "";
      continue;
    }
    current += ch;
  }
  declarations.push(current);

  return declarations
    .map((declaration) => declaration.trim())
    .filter((declaration) => declaration !== "" && !/^mso-/i.test(declaration))
    .join("; ");
}
// Heuristic: content is treated as HTML when it contains "<" immediately
// followed by an ASCII letter and a ">" occurs somewhere after that; anything
// else is plain text.
//
// This gives the same answer as testing /<[a-z][\s\S]*>/i, but in linear time.
// That regex rescans to the end of the input for every "<x" candidate when no
// ">" follows, which is quadratic on (untrusted) bodies containing many "<x"
// sequences. It is enough to look at the FIRST "<letter": if any ">" exists
// after it the content is HTML, and if none does, no later candidate can have
// one either.
function isPlainText(content: string): boolean {
  const firstTagStart = content.search(/<[a-z]/i);
  if (firstTagStart === -1) return true;
  return content.lastIndexOf(">") < firstTagStart;
}
// Point an element's inline `cid:` source at the stored attachment file.
// If `src` has the form "cid:<id>" and the normalized id is a key of `cidMap`,
// `src` is replaced with `${baseUrl}/files/<attachment id>/<encoded filename>`.
// Elements whose src is not a cid: reference, or whose id has no matching
// attachment, are left unchanged.
function rewriteCidSrc(
  el: Element,
  cidMap: Map<string, AttachmentData>,
  baseUrl: string,
): void {
  const source = el.getAttribute("src") ?? "";
  const parsed = /^\s*cid:(.+)$/i.exec(source);
  if (!parsed) return;

  // An id that normalizes to nothing is looked up as "", which is never a key
  // the callers in this file insert, so it falls through to the early return.
  const key = normalizeCid(parsed[1]) ?? "";
  const target = cidMap.get(key);
  if (!target) return;

  const encodedName = encodeURIComponent(target.filename);
  el.setAttribute("src", `${baseUrl}/files/${target.id}/${encodedName}`);
}
// Strip active content and Office noise from a single element, in place:
//   - every attribute whose name starts with "on" (onclick, onerror, onload, …)
//     is removed;
//   - URL-bearing attributes (href, src, action, formaction, xlink:href) are
//     removed when their value is a javascript: URL;
//   - mso-* declarations are dropped from the inline style, and the style
//     attribute is removed entirely when nothing else remains.
//
// The javascript: check mirrors how browsers parse URLs: ASCII tab/newline are
// removed from anywhere in the value and leading C0 control characters/spaces
// are skipped before the scheme is read. Without that, values such as
// "java\tscript:alert(1)" or "\u0001javascript:alert(1)" would slip past a
// plain /^\s*javascript:/ test while still executing in a browser.
function sanitizeElement(el: Element): void {
  // Snapshot attribute names before mutating (linkedom attributes is array-like)
  const names = Array.from(
    el.attributes as unknown as ArrayLike<{ name: string }>,
  ).map((a) => a.name);

  const urlAttributes = ["href", "src", "action", "formaction", "xlink:href"];

  for (const name of names) {
    // Remove event handlers (onclick, onerror, onload, …)
    if (/^on/i.test(name)) {
      el.removeAttribute(name);
      continue;
    }

    // Remove javascript: URLs
    if (urlAttributes.includes(name.toLowerCase())) {
      const raw = el.getAttribute(name) ?? "";
      // \s is kept in the leading class so everything the previous
      // /^\s*javascript:/ check caught is still caught.
      const scheme = raw
        .replace(/[\t\n\r]/g, "")
        .replace(/^[\s\u0000-\u0020]+/, "");
      if (/^javascript:/i.test(scheme)) {
        el.removeAttribute(name);
      }
    }
  }

  // Strip mso-* inline style properties (Office HTML noise)
  const style = el.getAttribute("style");
  if (style !== null) {
    const cleaned = cleanMsoStyles(style);
    if (cleaned) {
      el.setAttribute("style", cleaned);
    } else {
      el.removeAttribute("style");
    }
  }
}
/**
 * Processes email content for safe display in feeds and entry pages.
 *
 * Plain text (see isPlainText) is HTML-escaped and wrapped in a `<pre>` block.
 * HTML is parsed and then, in this order:
 * 1. Dangerous elements are removed: `<script>`, `<object>`, `<embed>`,
 *    `<iframe>`, `<frame>`, `<frameset>`.
 * 2. Lazy-loaded images are promoted (data-src → src, `loading` stripped).
 *    This runs BEFORE sanitization so that a value copied from `data-src`
 *    into `src` is vetted by the javascript: filter like any other `src`.
 * 3. Every element is sanitized: event-handler attributes and javascript:
 *    URLs are removed, mso-* inline style properties (Office HTML) stripped.
 * 4. Relative href/src are absolutized against `senderBaseUrl` so links and
 *    images don't break in readers that ignore xml:base.
 * 5. Inline `cid:` image refs are rewritten to the stored attachment URL.
 *
 * @param content - Raw email body (HTML or plain text).
 * @param attachments - Attachments whose `contentId` may be referenced by
 *   `cid:` sources in the body.
 * @param baseUrl - Prefix for rewritten attachment URLs. `""` yields relative
 *   URLs (entry page, same origin); a non-empty value yields absolute URLs
 *   (feeds, for external RSS readers).
 * @param senderBaseUrl - The sender's site (best-effort) used to absolutize
 *   relative links; `""` disables absolutization.
 * @returns The processed HTML: the `<body>` contents when the parsed document
 *   has a body, otherwise the serialized document. `""` for empty content.
 */
export function processEmailContent(
  content: string,
  attachments?: AttachmentData[],
  baseUrl = "",
  senderBaseUrl = "",
): string {
  if (!content) return "";
  if (isPlainText(content)) {
    return `<pre style="white-space: pre-wrap; word-break: break-word;">${escapeHtml(content)}</pre>`;
  }

  const cidMap = new Map<string, AttachmentData>();
  for (const att of attachments ?? []) {
    const cid = normalizeCid(att.contentId);
    if (cid) cidMap.set(cid, att);
  }

  const { document } = parseHTML(content);

  document
    .querySelectorAll("script, object, embed, iframe, frame, frameset")
    .forEach((el: Element) => el.remove());

  // Promote first, sanitize second: otherwise a javascript: value sitting in
  // data-src would be copied into src after the filter has already run.
  promoteLazyImages(document);
  document.querySelectorAll("*").forEach((el: Element) => sanitizeElement(el));

  // Absolutize before the cid rewrite: cid: refs are skipped here (not
  // http(s)), then rewritten below to our /files/ URL — which must NOT be
  // absolutized to the sender.
  absolutizeUrls(document, senderBaseUrl);
  if (cidMap.size > 0) {
    document
      .querySelectorAll("[src]")
      .forEach((el: Element) => rewriteCidSrc(el, cidMap, baseUrl));
  }

  // Full documents expose a <body>; bodyless fragments are serialized directly
  // so that sanitization and cid rewriting still apply to their nodes.
  const body = document.querySelector("body");
  return body ? body.innerHTML : document.toString();
}