mirror of
https://github.com/juherr/kill-the-news.git
synced 2026-06-20 22:03:48 +00:00
feat: reader-rendering correctness + privacy hardening (P1·S batch)
Close the five open P1·S items from TODO.md: - X-Robots-Tag: noindex on rss/atom/entries/files + a /robots.txt - absolutize relative content URLs against the sender's site - promote lazy-loaded images (data-src → src, strip loading="lazy") - strip XML-illegal control chars from generated feeds (keep emoji) - plain-text feed <title> (strip HTML, decode entities) Sender-base derivation lives on the EmailAddress value object (siteBaseUrl) instead of a misplaced favicon helper. Bump to 0.2.1 and document the changes in README + CLAUDE.md. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -313,6 +313,66 @@ describe("generateAtomFeed", () => {
|
||||
expect(result).toContain("Bob");
|
||||
});
|
||||
|
||||
it("renders the subject as plain text in <title> (strips tags, decodes entities)", () => {
|
||||
const emailWithHtmlSubject: EmailData = {
|
||||
...mockEmails[0],
|
||||
subject: "<b>Sale</b> Tom & Jerry",
|
||||
};
|
||||
const result = generateAtomFeed(
|
||||
mockFeedConfig,
|
||||
[emailWithHtmlSubject],
|
||||
BASE_URL,
|
||||
FEED_ID,
|
||||
);
|
||||
// Tags are stripped and entities decoded; markup must not survive.
|
||||
expect(result).toContain("Sale Tom & Jerry");
|
||||
expect(result).not.toContain("<b>Sale</b>");
|
||||
});
|
||||
|
||||
it("strips XML-illegal control characters from the output", () => {
|
||||
const emailWithControlChar: EmailData = {
|
||||
...mockEmails[0],
|
||||
subject: "Bad\x00\x1Fchar",
|
||||
content: "<p>body\x0Bhere</p>",
|
||||
};
|
||||
const result = generateAtomFeed(
|
||||
mockFeedConfig,
|
||||
[emailWithControlChar],
|
||||
BASE_URL,
|
||||
FEED_ID,
|
||||
);
|
||||
expect(result).not.toMatch(/[\x00\x0B\x1F]/);
|
||||
});
|
||||
|
||||
it("preserves emoji (surrogate pairs) in the output", () => {
|
||||
const emailWithEmoji: EmailData = {
|
||||
...mockEmails[0],
|
||||
subject: "Launch 🚀 today",
|
||||
};
|
||||
const result = generateAtomFeed(
|
||||
mockFeedConfig,
|
||||
[emailWithEmoji],
|
||||
BASE_URL,
|
||||
FEED_ID,
|
||||
);
|
||||
expect(result).toContain("🚀");
|
||||
});
|
||||
|
||||
it("absolutizes relative content URLs against the sender domain", () => {
|
||||
const emailWithRelative: EmailData = {
|
||||
...mockEmails[0],
|
||||
from: "News <news@acme.com>",
|
||||
content: '<body><a href="/article">read</a></body>',
|
||||
};
|
||||
const result = generateAtomFeed(
|
||||
mockFeedConfig,
|
||||
[emailWithRelative],
|
||||
BASE_URL,
|
||||
FEED_ID,
|
||||
);
|
||||
expect(result).toContain("https://acme.com/article");
|
||||
});
|
||||
|
||||
it("includes enclosure link for email with attachment in Atom feed", () => {
|
||||
const result = generateAtomFeed(
|
||||
mockFeedConfig,
|
||||
|
||||
@@ -1,9 +1,18 @@
|
||||
import { Feed } from "feed";
|
||||
import { FeedConfig, EmailData } from "../types";
|
||||
import { processEmailContent } from "./html-processor";
|
||||
import { processEmailContent, htmlToText } from "./html-processor";
|
||||
import { EmailAddress } from "../domain/value-objects/email-address";
|
||||
|
||||
export { processEmailContent as extractBodyContent };
|
||||
|
||||
// XML 1.0 valid chars: #x9 #xA #xD #x20-#xD7FF #xE000-#xFFFD #x10000-#x10FFFF.
// A single illegal codepoint fails the whole feed parse in strict readers, so
// strip the complement before returning. The `u` flag iterates by code point, so
// valid surrogate pairs (emoji, …) survive while lone surrogates are removed.
// The ranges are spelled with escapes rather than literal characters so the
// boundary code points (U+D7FF, U+E000, U+FFFD) cannot be lost or altered when
// the file passes through tools that mishandle them.
function stripInvalidXmlChars(xml: string): string {
  return xml.replace(
    /[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\u{10000}-\u{10FFFF}]/gu,
    "",
  );
}
|
||||
|
||||
function parseFromAddress(from: string): { name: string; email?: string } {
|
||||
const match = from.match(/^(.*?)\s*<([^>]+)>\s*$/);
|
||||
if (match) {
|
||||
@@ -60,9 +69,10 @@ function buildFeed(
|
||||
email.content,
|
||||
email.attachments,
|
||||
baseUrl,
|
||||
EmailAddress.parse(email.from)?.siteBaseUrl() ?? "",
|
||||
);
|
||||
feed.addItem({
|
||||
title: email.subject,
|
||||
title: htmlToText(email.subject),
|
||||
id: entryUrl,
|
||||
link: entryUrl,
|
||||
description: bodyContent,
|
||||
@@ -89,13 +99,15 @@ export function generateRssFeed(
|
||||
feedId: string,
|
||||
selfUrl?: string,
|
||||
): string {
|
||||
return buildFeed(
|
||||
feedConfig,
|
||||
emails,
|
||||
baseUrl,
|
||||
feedId,
|
||||
selfUrl ? { rss: selfUrl } : undefined,
|
||||
).rss2();
|
||||
return stripInvalidXmlChars(
|
||||
buildFeed(
|
||||
feedConfig,
|
||||
emails,
|
||||
baseUrl,
|
||||
feedId,
|
||||
selfUrl ? { rss: selfUrl } : undefined,
|
||||
).rss2(),
|
||||
);
|
||||
}
|
||||
|
||||
export function generateAtomFeed(
|
||||
@@ -105,11 +117,13 @@ export function generateAtomFeed(
|
||||
feedId: string,
|
||||
selfUrl?: string,
|
||||
): string {
|
||||
return buildFeed(
|
||||
feedConfig,
|
||||
emails,
|
||||
baseUrl,
|
||||
feedId,
|
||||
selfUrl ? { atom: selfUrl } : undefined,
|
||||
).atom1();
|
||||
return stripInvalidXmlChars(
|
||||
buildFeed(
|
||||
feedConfig,
|
||||
emails,
|
||||
baseUrl,
|
||||
feedId,
|
||||
selfUrl ? { atom: selfUrl } : undefined,
|
||||
).atom1(),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { processEmailContent, extractInlineCids } from "./html-processor";
|
||||
import {
|
||||
processEmailContent,
|
||||
extractInlineCids,
|
||||
htmlToText,
|
||||
} from "./html-processor";
|
||||
import type { AttachmentData } from "../types";
|
||||
|
||||
describe("processEmailContent — body extraction", () => {
|
||||
@@ -197,6 +201,105 @@ describe("processEmailContent — inline cid: rewriting", () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe("processEmailContent — lazy image promotion", () => {
|
||||
it("promotes data-src to src when src is missing", () => {
|
||||
const html = '<body><img data-src="https://x.com/a.png"/></body>';
|
||||
const result = processEmailContent(html);
|
||||
expect(result).toContain('src="https://x.com/a.png"');
|
||||
});
|
||||
|
||||
it("promotes data-src over a data: placeholder src", () => {
|
||||
const html =
|
||||
'<body><img src="data:image/gif;base64,AAAA" data-src="https://x.com/a.png"/></body>';
|
||||
const result = processEmailContent(html);
|
||||
expect(result).toContain('src="https://x.com/a.png"');
|
||||
expect(result).not.toContain("data:image/gif");
|
||||
});
|
||||
|
||||
it("does not clobber a real src with data-src", () => {
|
||||
const html =
|
||||
'<body><img src="https://real.com/a.png" data-src="https://lazy.com/b.png"/></body>';
|
||||
const result = processEmailContent(html);
|
||||
expect(result).toContain('src="https://real.com/a.png"');
|
||||
});
|
||||
|
||||
it("promotes data-srcset when srcset is absent", () => {
|
||||
const html = '<body><img data-srcset="https://x.com/a.png 2x"/></body>';
|
||||
const result = processEmailContent(html);
|
||||
expect(result).toContain('srcset="https://x.com/a.png 2x"');
|
||||
});
|
||||
|
||||
it("strips loading=lazy", () => {
|
||||
const html = '<body><img src="https://x.com/a.png" loading="lazy"/></body>';
|
||||
const result = processEmailContent(html);
|
||||
expect(result).not.toContain("loading");
|
||||
});
|
||||
});
|
||||
|
||||
describe("processEmailContent — relative URL absolutization", () => {
|
||||
const base = "https://news.example.com/";
|
||||
|
||||
it("absolutizes a root-relative href against the sender base", () => {
|
||||
const html = '<body><a href="/path">link</a></body>';
|
||||
const result = processEmailContent(html, undefined, "", base);
|
||||
expect(result).toContain('href="https://news.example.com/path"');
|
||||
});
|
||||
|
||||
it("absolutizes a relative img src against the sender base", () => {
|
||||
const html = '<body><img src="img/a.png"/></body>';
|
||||
const result = processEmailContent(html, undefined, "", base);
|
||||
expect(result).toContain('src="https://news.example.com/img/a.png"');
|
||||
});
|
||||
|
||||
it("resolves protocol-relative URLs using https", () => {
|
||||
const html = '<body><img src="//cdn.example.com/a.png"/></body>';
|
||||
const result = processEmailContent(html, undefined, "", base);
|
||||
expect(result).toContain('src="https://cdn.example.com/a.png"');
|
||||
});
|
||||
|
||||
it("leaves absolute URLs unchanged", () => {
|
||||
const html = '<body><a href="https://other.com/x">l</a></body>';
|
||||
const result = processEmailContent(html, undefined, "", base);
|
||||
expect(result).toContain('href="https://other.com/x"');
|
||||
});
|
||||
|
||||
it("does not touch relative URLs when no sender base is given", () => {
|
||||
const html = '<body><a href="/path">link</a></body>';
|
||||
const result = processEmailContent(html);
|
||||
expect(result).toContain('href="/path"');
|
||||
});
|
||||
|
||||
it("does not absolutize mailto: or anchors", () => {
|
||||
const html =
|
||||
'<body><a href="mailto:x@y.com">m</a><a href="#top">t</a></body>';
|
||||
const result = processEmailContent(html, undefined, "", base);
|
||||
expect(result).toContain('href="mailto:x@y.com"');
|
||||
expect(result).toContain('href="#top"');
|
||||
});
|
||||
});
|
||||
|
||||
// Unit tests for htmlToText: tag stripping, entity decoding, whitespace
// normalisation, and the empty / already-plain inputs.
describe("htmlToText", () => {
  it("strips HTML tags", () => {
    expect(htmlToText("<b>Bold</b> text")).toBe("Bold text");
  });

  it("decodes HTML entities", () => {
    // The input must carry real entities, otherwise this case does not
    // exercise decoding at all.
    expect(htmlToText("Tom &amp; Jerry &lt;3")).toBe("Tom & Jerry <3");
  });

  it("collapses whitespace and trims", () => {
    expect(htmlToText(" a\n\n b ")).toBe("a b");
  });

  it("returns empty string for empty input", () => {
    expect(htmlToText("")).toBe("");
  });

  it("leaves plain text untouched", () => {
    expect(htmlToText("Just a subject")).toBe("Just a subject");
  });
});
|
||||
|
||||
describe("extractInlineCids", () => {
|
||||
it("collects normalized cids referenced by cid: image sources", () => {
|
||||
const html = '<body><img src="cid:ii_abc"/><img src="CID:ii_def"/></body>';
|
||||
|
||||
@@ -2,6 +2,8 @@ import { parseHTML } from "linkedom";
|
||||
import escapeHtml from "escape-html";
|
||||
import type { AttachmentData } from "../types";
|
||||
|
||||
type ParsedDocument = ReturnType<typeof parseHTML>["document"];
|
||||
|
||||
// Strip surrounding angle brackets and whitespace from a Content-ID so that a
|
||||
// stored value like "<ii_mpi85rqy0>" matches an HTML reference "cid:ii_mpi85rqy0".
|
||||
export function normalizeCid(
|
||||
@@ -28,6 +30,66 @@ export function extractInlineCids(content: string): Set<string> {
|
||||
return cids;
|
||||
}
|
||||
|
||||
// Render an HTML fragment (or already-plain string) down to plain text: strips
// tags and decodes entities. Used for feed <title>s, which must be plain text —
// raw markup/entities show literally in readers.
//
// The value is wrapped in <body> before parsing, the parsed document's text
// content is taken, runs of whitespace are collapsed to a single space, and the
// result is trimmed. Empty input short-circuits to "" without parsing.
export function htmlToText(value: string): string {
  if (!value) return "";
  const { document } = parseHTML(`<body>${value}</body>`);
  const text = document.documentElement?.textContent ?? "";
  return text.replace(/\s+/g, " ").trim();
}
|
||||
|
||||
// Newsletters frequently defer images via data-src/loading="lazy"; readers don't
// run the lazy-loader, so the image renders blank. Promote the real source.
//
// For every <img> in the document:
// - the first non-empty of data-src / data-original / data-lazy-src replaces
//   `src`, but only when `src` is missing, blank, or a data: placeholder — a
//   real `src` is never overwritten;
// - data-srcset is copied to `srcset` when no `srcset` is present;
// - the `loading` attribute is removed unconditionally.
// Mutates the document in place.
function promoteLazyImages(document: ParsedDocument): void {
  document.querySelectorAll("img").forEach((img: Element) => {
    const lazySrc =
      img.getAttribute("data-src") ||
      img.getAttribute("data-original") ||
      img.getAttribute("data-lazy-src");
    if (lazySrc) {
      const current = (img.getAttribute("src") ?? "").trim();
      const isPlaceholder = !current || /^data:/i.test(current);
      if (isPlaceholder) {
        img.setAttribute("src", lazySrc);
      }
    }

    const lazySrcset = img.getAttribute("data-srcset");
    if (lazySrcset && !img.getAttribute("srcset")) {
      img.setAttribute("srcset", lazySrcset);
    }

    img.removeAttribute("loading");
  });
}
|
||||
|
||||
// Resolve a single URL against the sender base. Returns null when there is
// nothing to rewrite: empty/blank values, same-document anchors (#…), and any
// value that already carries its own scheme (http:, https:, mailto:, tel:,
// data:, cid:, but also ftp:, javascript:, sms:, …). Protocol-relative
// //host/x values have no scheme, so they are resolved and pick up the base's
// scheme. Also returns null when the URL constructor rejects the value or the
// base.
function toAbsolute(value: string, base: string): string | null {
  const v = value.trim();
  // RFC 3986 scheme syntax: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) ":".
  // Anything matching it is already absolute and must be left untouched.
  if (!v || v.startsWith("#") || /^[a-z][a-z0-9+.-]*:/i.test(v)) return null;
  try {
    return new URL(v, base).href;
  } catch {
    return null;
  }
}
|
||||
|
||||
// Most readers ignore xml:base, so relative href/src in content break. Absolutize
// them against the sender's site (best-effort, derived from its email domain).
// Protocol-relative //host/x are resolved too (they pick up the base's https:).
//
// Rewrites `href` on <a>/<area> and `src` on <img>, in that order, replacing the
// attribute only when toAbsolute yields a value. Does nothing when `base` is
// empty. Mutates the document in place.
function absolutizeUrls(document: ParsedDocument, base: string): void {
  if (!base) return;

  // Selector → attribute pairs to rewrite.
  const targets: ReadonlyArray<readonly [string, string]> = [
    ["a[href], area[href]", "href"],
    ["img[src]", "src"],
  ];

  for (const [selector, attribute] of targets) {
    document.querySelectorAll(selector).forEach((el: Element) => {
      const abs = toAbsolute(el.getAttribute(attribute) ?? "", base);
      if (abs) el.setAttribute(attribute, abs);
    });
  }
}
|
||||
|
||||
function cleanMsoStyles(style: string): string {
|
||||
return style
|
||||
.split(";")
|
||||
@@ -98,11 +160,15 @@ function sanitizeElement(el: Element): void {
|
||||
* - Rewrites inline cid: image refs to the stored attachment URL. baseUrl=""
|
||||
* yields relative URLs (entry page, same origin); a baseUrl yields absolute
|
||||
* URLs (feeds, for external RSS readers).
|
||||
* - Promotes lazy-loaded images (data-src → src, strips loading="lazy").
|
||||
* - Absolutizes relative href/src against senderBaseUrl (the sender's site,
|
||||
* best-effort) so links/images don't break in readers that ignore xml:base.
|
||||
*/
|
||||
export function processEmailContent(
|
||||
content: string,
|
||||
attachments?: AttachmentData[],
|
||||
baseUrl = "",
|
||||
senderBaseUrl = "",
|
||||
): string {
|
||||
if (!content) return "";
|
||||
|
||||
@@ -124,6 +190,11 @@ export function processEmailContent(
|
||||
|
||||
document.querySelectorAll("*").forEach((el: Element) => sanitizeElement(el));
|
||||
|
||||
promoteLazyImages(document);
|
||||
// Absolutize first: cid: refs are skipped here (not http(s)), then rewritten
|
||||
// below to our /files/ URL — which must NOT be absolutized to the sender.
|
||||
absolutizeUrls(document, senderBaseUrl);
|
||||
|
||||
if (cidMap.size > 0) {
|
||||
document
|
||||
.querySelectorAll("[src]")
|
||||
|
||||
Reference in New Issue
Block a user