Files
kill-the-news/src/utils/html-processor.ts
T
Julien Herr a29e9ab372 feat: WebSub Atom support, HTML processing via linkedom, W3C badges
WebSub / PubSubHubbub:
- Hub now accepts both /rss/:id and /atom/:id topic URLs
- WebSubSubscription stores format ("rss" | "atom")
- notifySubscribers sends RSS or Atom XML with correct Content-Type
- verifyAndStoreSubscription sends correct topic URL per format
- CI paths-ignore docs/** to skip deploy on docs-only changes

HTML processing (linkedom + escape-html):
- New html-processor.ts: body extraction, script/iframe/object removal,
  event handler + javascript: URL stripping, mso-* style cleanup,
  plain text → <pre> with HTML escaping via escape-html
- feed-generator.ts and entries.ts use processEmailContent

Admin UI:
- W3C validation badges (Atom + RSS) on feed detail page

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-05-22 21:12:22 +02:00

74 lines
2.2 KiB
TypeScript

import { parseHTML } from "linkedom";
import escapeHtml from "escape-html";
/**
 * Splits an inline `style` value into its individual declarations.
 *
 * Only semicolons at the top level act as separators: semicolons inside
 * parentheses (e.g. `url(data:image/png;base64,...)`) or inside quoted
 * strings are part of the declaration value and are left alone.
 *
 * If the value is malformed (unterminated quote or unbalanced parenthesis),
 * falls back to a plain split on every semicolon.
 */
function splitStyleDeclarations(style: string): string[] {
  const declarations: string[] = [];
  let start = 0;
  let depth = 0;
  let quote = "";
  for (let i = 0; i < style.length; i++) {
    const ch = style.charAt(i);
    if (ch === "\\") {
      // CSS escape: the next character is literal, skip it.
      i++;
    } else if (quote) {
      if (ch === quote) quote = "";
    } else if (ch === '"' || ch === "'") {
      quote = ch;
    } else if (ch === "(") {
      depth++;
    } else if (ch === ")") {
      if (depth > 0) depth--;
    } else if (ch === ";" && depth === 0) {
      declarations.push(style.slice(start, i));
      start = i + 1;
    }
  }
  if (quote || depth > 0) {
    return style.split(";");
  }
  declarations.push(style.slice(start));
  return declarations;
}

/**
 * Removes every `mso-*` declaration (Microsoft Office noise) from an inline
 * `style` attribute value.
 *
 * @param style - Raw value of a `style` attribute.
 * @returns The remaining declarations, trimmed and joined with `"; "`;
 *   an empty string when nothing is left.
 */
function cleanMsoStyles(style: string): string {
  return splitStyleDeclarations(style)
    .map((declaration) => declaration.trim())
    .filter((declaration) => declaration && !/^mso-/i.test(declaration))
    .join("; ");
}
/**
 * Heuristic check for content that carries no HTML markup.
 *
 * Content counts as markup as soon as it contains a `<` immediately followed
 * by an ASCII letter with a `>` somewhere after it; anything else is
 * reported as plain text.
 */
function isPlainText(content: string): boolean {
  const markupPattern = /<[a-z][\s\S]*>/i;
  return content.search(markupPattern) === -1;
}
// Attributes whose value is a URL that may be navigated to or executed.
const URL_ATTRIBUTES = new Set([
  "href",
  "src",
  "action",
  "formaction",
  "xlink:href",
]);

/**
 * Tells whether an attribute value would be treated as a `javascript:` URL.
 *
 * URL parsing drops ASCII tab / CR / LF anywhere in the input and ignores
 * leading control characters and spaces, so `java\tscript:` or
 * `\u0001javascript:` still execute; the value is normalized the same way
 * before the scheme is checked.
 */
function isJavascriptUrl(value: string): boolean {
  const normalized = value
    .replace(/[\t\n\r]/g, "")
    .replace(/^[\s\u0000-\u0020]+/, "");
  return /^javascript:/i.test(normalized);
}

/**
 * Sanitizes a single element in place:
 * - removes every attribute whose name starts with `on` (event handlers),
 * - removes URL-bearing attributes that hold a `javascript:` URL,
 * - strips `mso-*` declarations from the inline `style` attribute, dropping
 *   the attribute entirely when nothing is left.
 *
 * @param el - Element to mutate.
 */
function sanitizeElement(el: Element): void {
  // Snapshot attribute names before mutating; the cast treats
  // `el.attributes` as an array-like of objects exposing `name`.
  const attrs = Array.from(
    el.attributes as unknown as ArrayLike<{ name: string }>,
  ).map((a) => a.name);

  for (const attr of attrs) {
    // Remove event handlers (onclick, onerror, onload, …)
    if (/^on/i.test(attr)) {
      el.removeAttribute(attr);
      continue;
    }
    // Remove javascript: URLs, including obfuscated spellings
    if (
      URL_ATTRIBUTES.has(attr.toLowerCase()) &&
      isJavascriptUrl(el.getAttribute(attr) ?? "")
    ) {
      el.removeAttribute(attr);
    }
  }

  // Strip mso-* inline style properties (Office HTML noise)
  const style = el.getAttribute("style");
  if (style !== null) {
    const cleaned = cleanMsoStyles(style);
    if (cleaned) {
      el.setAttribute("style", cleaned);
    } else {
      el.removeAttribute("style");
    }
  }
}
/**
 * Processes email content for safe display in feeds and entry pages:
 * - Returns an empty string for empty input
 * - Detects plain text and wraps it, HTML-escaped, in a <pre> block
 * - Removes dangerous elements: <script>, <object>, <embed>, <iframe>,
 *   <frame>, <frameset>
 * - Removes event handler attributes and javascript: URLs
 * - Strips mso-* inline style properties (Office HTML)
 * - Returns the inner HTML of <body> when the parsed document has one,
 *   otherwise the serialized sanitized document
 *
 * @param content - Raw email body (plain text or HTML).
 * @returns HTML markup for display.
 */
export function processEmailContent(content: string): string {
  if (!content) return "";

  if (isPlainText(content)) {
    return `<pre style="white-space: pre-wrap; word-break: break-word;">${escapeHtml(content)}</pre>`;
  }

  const { document } = parseHTML(content);

  // Drop elements that can execute code or embed foreign documents.
  document
    .querySelectorAll("script, object, embed, iframe, frame, frameset")
    .forEach((el: Element) => el.remove());

  document.querySelectorAll("*").forEach((el: Element) => sanitizeElement(el));

  const body = document.querySelector("body");
  if (body) return body.innerHTML;

  // No <body> element was found (presumably the input was an HTML fragment).
  // Serialize the sanitized tree: returning the raw `content` here would
  // hand back the markup with scripts and event handlers still in place.
  return document.toString();
}