feat: reader-rendering correctness + privacy hardening (P1·S batch)

Close the five open P1·S items from TODO.md:
- X-Robots-Tag: noindex on rss/atom/entries/files + a /robots.txt
- absolutize relative content URLs against the sender's site
- promote lazy-loaded images (data-src → src, strip loading="lazy")
- strip XML-illegal control chars from generated feeds (keep emoji)
- plain-text feed <title> (strip HTML, decode entities)

Sender-base derivation lives on the EmailAddress value object
(siteBaseUrl) instead of a misplaced favicon helper. Bump to 0.2.1
and document the changes in README + CLAUDE.md.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Julien Herr
2026-05-24 17:47:46 +02:00
parent 81e46c9026
commit 97ce9a62b4
20 changed files with 414 additions and 29 deletions
@@ -24,4 +24,10 @@ describe("EmailAddress", () => {
expect(EmailAddress.parse("not an email")).toBeNull();
expect(EmailAddress.parse("")).toBeNull();
});
// The input address uses a mixed-case domain ("Example.com") while the expected
// base URL is lower-case, so this case also pins domain case normalization in
// the derived URL, plus the https scheme and the trailing slash.
it("derives the sender site base URL from the domain", () => {
expect(EmailAddress.parse("News <a@Example.com>")?.siteBaseUrl()).toBe(
"https://example.com/",
);
});
});
@@ -20,6 +20,15 @@ export class EmailAddress {
return new EmailAddress(`${local}@${domain.value}`, domain);
}
/**
 * Returns the website root we assume for this sender: the address's domain
 * wrapped as an `https://…/` URL (e.g. `https://example.com/`).
 *
 * This is a pure string derivation — nothing is fetched or validated, so the
 * site may not actually exist. Callers use it as the base for resolving
 * relative links found in the email body, since the sender's domain is the
 * only hint available.
 *
 * @returns The inferred site base URL, always ending in a slash.
 */
siteBaseUrl(): string {
  const host = this.domain.value;
  return `https://${host}/`;
}
toString(): string {
return this.normalized;
}
+14
View File
@@ -54,3 +54,17 @@ describe("CORS middleware", () => {
expect(res.headers.get("Access-Control-Allow-Origin")).toBe("*");
});
});
describe("GET /robots.txt", () => {
  // Lines the served robots.txt must contain: the wildcard user-agent plus one
  // Disallow rule per private route prefix.
  const requiredLines = [
    "User-agent: *",
    "Disallow: /rss/",
    "Disallow: /atom/",
    "Disallow: /entries/",
    "Disallow: /files/",
    "Disallow: /admin/",
  ];

  it("returns 200 and disallows the private feed/entry paths", async () => {
    const res = await worker.fetch(req("/robots.txt"), env as unknown as Env);
    expect(res.status).toBe(200);

    const body = await res.text();
    for (const line of requiredLines) {
      expect(body).toContain(line);
    }
  });
});
+8
View File
@@ -184,6 +184,14 @@ app.get("/health", (c) => c.json({ status: "ok", timestamp: Date.now() }));
// Public status page (counters + link to admin)
app.get("/", handleHome);
// Serve a static robots.txt asking all crawlers to stay out of the private
// feed, entry, file and admin paths. Intended as a second layer next to the
// X-Robots-Tag headers set on the feed/entry/file responses.
// NOTE(review): a crawler that obeys these Disallow rules never fetches the
// URLs, so it never sees the X-Robots-Tag: noindex header; a disallowed URL
// that is linked from elsewhere may still be listed URL-only. Confirm that
// this combination is the intended trade-off.
app.get("/robots.txt", (c) => {
  const rules =
    "User-agent: *\nDisallow: /rss/\nDisallow: /atom/\nDisallow: /entries/\nDisallow: /files/\nDisallow: /admin/\n";
  return c.text(rules);
});
// Catch-all for 404s
app.all("*", (c) => c.text("Not Found", 404));
+60
View File
@@ -313,6 +313,66 @@ describe("generateAtomFeed", () => {
expect(result).toContain("Bob");
});
// Subjects can carry markup and HTML entities; the feed <title> is expected to
// contain the plain-text rendering only.
it("renders the subject as plain text in <title> (strips tags, decodes entities)", () => {
const emailWithHtmlSubject: EmailData = {
...mockEmails[0],
subject: "<b>Sale</b> Tom &amp; Jerry",
};
const result = generateAtomFeed(
mockFeedConfig,
[emailWithHtmlSubject],
BASE_URL,
FEED_ID,
);
// Tags are stripped and entities decoded; markup must not survive.
expect(result).toContain("Sale Tom & Jerry");
expect(result).not.toContain("<b>Sale</b>");
});
// NUL, vertical tab and unit separator are injected through both the subject
// and the body; none of them may appear anywhere in the serialized feed.
it("strips XML-illegal control characters from the output", () => {
const emailWithControlChar: EmailData = {
...mockEmails[0],
subject: "Bad\x00\x1Fchar",
content: "<p>body\x0Bhere</p>",
};
const result = generateAtomFeed(
mockFeedConfig,
[emailWithControlChar],
BASE_URL,
FEED_ID,
);
expect(result).not.toMatch(/[\x00\x0B\x1F]/);
});
// Guards against the control-character stripping also removing characters
// outside the BMP, which JavaScript strings store as surrogate pairs.
it("preserves emoji (surrogate pairs) in the output", () => {
const emailWithEmoji: EmailData = {
...mockEmails[0],
subject: "Launch 🚀 today",
};
const result = generateAtomFeed(
mockFeedConfig,
[emailWithEmoji],
BASE_URL,
FEED_ID,
);
expect(result).toContain("🚀");
});
// The root-relative href must be resolved against the site derived from the
// `from` address (acme.com), not against the feed's own BASE_URL.
it("absolutizes relative content URLs against the sender domain", () => {
const emailWithRelative: EmailData = {
...mockEmails[0],
from: "News <news@acme.com>",
content: '<body><a href="/article">read</a></body>',
};
const result = generateAtomFeed(
mockFeedConfig,
[emailWithRelative],
BASE_URL,
FEED_ID,
);
expect(result).toContain("https://acme.com/article");
});
it("includes enclosure link for email with attachment in Atom feed", () => {
const result = generateAtomFeed(
mockFeedConfig,
+30 -16
View File
@@ -1,9 +1,18 @@
import { Feed } from "feed";
import { FeedConfig, EmailData } from "../types";
import { processEmailContent } from "./html-processor";
import { processEmailContent, htmlToText } from "./html-processor";
import { EmailAddress } from "../domain/value-objects/email-address";
export { processEmailContent as extractBodyContent };
/**
 * Removes every code point that XML 1.0 does not allow in a document.
 *
 * XML 1.0 valid chars: #x9 #xA #xD #x20-#xD7FF #xE000-#xFFFD #x10000-#x10FFFF.
 * A single illegal code point fails the whole feed parse in strict readers, so
 * the complement of that set is stripped from the serialized feed.
 *
 * The `u` flag makes the regex iterate by code point: a valid surrogate pair
 * (emoji, …) is seen as one astral code point and survives, while a lone
 * surrogate falls in the excluded #xD800-#xDFFF gap and is removed.
 *
 * The BMP ranges are written as `\u` escapes on purpose — literal boundary
 * characters (U+D7FF, U+E000, U+FFFD) are easily lost or mangled by editors and
 * tooling, which silently turns the class into one that deletes ordinary text.
 *
 * @param xml Serialized XML (or any string destined for an XML document).
 * @returns `xml` with all XML-1.0-illegal code points removed.
 */
function stripInvalidXmlChars(xml: string): string {
  return xml.replace(
    /[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\u{10000}-\u{10FFFF}]/gu,
    "",
  );
}
function parseFromAddress(from: string): { name: string; email?: string } {
const match = from.match(/^(.*?)\s*<([^>]+)>\s*$/);
if (match) {
@@ -60,9 +69,10 @@ function buildFeed(
email.content,
email.attachments,
baseUrl,
EmailAddress.parse(email.from)?.siteBaseUrl() ?? "",
);
feed.addItem({
title: email.subject,
title: htmlToText(email.subject),
id: entryUrl,
link: entryUrl,
description: bodyContent,
@@ -89,13 +99,15 @@ export function generateRssFeed(
feedId: string,
selfUrl?: string,
): string {
return buildFeed(
feedConfig,
emails,
baseUrl,
feedId,
selfUrl ? { rss: selfUrl } : undefined,
).rss2();
return stripInvalidXmlChars(
buildFeed(
feedConfig,
emails,
baseUrl,
feedId,
selfUrl ? { rss: selfUrl } : undefined,
).rss2(),
);
}
export function generateAtomFeed(
@@ -105,11 +117,13 @@ export function generateAtomFeed(
feedId: string,
selfUrl?: string,
): string {
return buildFeed(
feedConfig,
emails,
baseUrl,
feedId,
selfUrl ? { atom: selfUrl } : undefined,
).atom1();
return stripInvalidXmlChars(
buildFeed(
feedConfig,
emails,
baseUrl,
feedId,
selfUrl ? { atom: selfUrl } : undefined,
).atom1(),
);
}
+104 -1
View File
@@ -1,5 +1,9 @@
import { describe, it, expect } from "vitest";
import { processEmailContent, extractInlineCids } from "./html-processor";
import {
processEmailContent,
extractInlineCids,
htmlToText,
} from "./html-processor";
import type { AttachmentData } from "../types";
describe("processEmailContent — body extraction", () => {
@@ -197,6 +201,105 @@ describe("processEmailContent — inline cid: rewriting", () => {
});
});
// Exercises lazy-image handling through the public processEmailContent entry
// point: data-src / data-srcset promotion and removal of the loading attribute.
describe("processEmailContent — lazy image promotion", () => {
// No src at all: the deferred source becomes the real one.
it("promotes data-src to src when src is missing", () => {
const html = '<body><img data-src="https://x.com/a.png"/></body>';
const result = processEmailContent(html);
expect(result).toContain('src="https://x.com/a.png"');
});
// A data: URI in src is treated as a placeholder and replaced.
it("promotes data-src over a data: placeholder src", () => {
const html =
'<body><img src="data:image/gif;base64,AAAA" data-src="https://x.com/a.png"/></body>';
const result = processEmailContent(html);
expect(result).toContain('src="https://x.com/a.png"');
expect(result).not.toContain("data:image/gif");
});
// A genuine src wins over data-src.
it("does not clobber a real src with data-src", () => {
const html =
'<body><img src="https://real.com/a.png" data-src="https://lazy.com/b.png"/></body>';
const result = processEmailContent(html);
expect(result).toContain('src="https://real.com/a.png"');
});
it("promotes data-srcset when srcset is absent", () => {
const html = '<body><img data-srcset="https://x.com/a.png 2x"/></body>';
const result = processEmailContent(html);
expect(result).toContain('srcset="https://x.com/a.png 2x"');
});
// The attribute is removed entirely, so the word "loading" must not appear.
it("strips loading=lazy", () => {
const html = '<body><img src="https://x.com/a.png" loading="lazy"/></body>';
const result = processEmailContent(html);
expect(result).not.toContain("loading");
});
});
// Exercises URL absolutization through processEmailContent. The 4th argument is
// the sender base URL; the 3rd (baseUrl for attachment links) is left empty.
describe("processEmailContent — relative URL absolutization", () => {
const base = "https://news.example.com/";
it("absolutizes a root-relative href against the sender base", () => {
const html = '<body><a href="/path">link</a></body>';
const result = processEmailContent(html, undefined, "", base);
expect(result).toContain('href="https://news.example.com/path"');
});
it("absolutizes a relative img src against the sender base", () => {
const html = '<body><img src="img/a.png"/></body>';
const result = processEmailContent(html, undefined, "", base);
expect(result).toContain('src="https://news.example.com/img/a.png"');
});
// "//host/path" inherits the scheme of the base, which is https here.
it("resolves protocol-relative URLs using https", () => {
const html = '<body><img src="//cdn.example.com/a.png"/></body>';
const result = processEmailContent(html, undefined, "", base);
expect(result).toContain('src="https://cdn.example.com/a.png"');
});
it("leaves absolute URLs unchanged", () => {
const html = '<body><a href="https://other.com/x">l</a></body>';
const result = processEmailContent(html, undefined, "", base);
expect(result).toContain('href="https://other.com/x"');
});
// Called without a sender base: relative URLs must be left as they are.
it("does not touch relative URLs when no sender base is given", () => {
const html = '<body><a href="/path">link</a></body>';
const result = processEmailContent(html);
expect(result).toContain('href="/path"');
});
// Non-navigational schemes and in-page anchors must never be resolved.
it("does not absolutize mailto: or anchors", () => {
const html =
'<body><a href="mailto:x@y.com">m</a><a href="#top">t</a></body>';
const result = processEmailContent(html, undefined, "", base);
expect(result).toContain('href="mailto:x@y.com"');
expect(result).toContain('href="#top"');
});
});
// Unit tests for htmlToText: tag stripping, entity decoding, whitespace
// normalization, and pass-through of empty / already-plain input.
describe("htmlToText", () => {
it("strips HTML tags", () => {
expect(htmlToText("<b>Bold</b> text")).toBe("Bold text");
});
// "&lt;3" decodes to "<3", which must survive as text rather than be parsed
// as the start of a tag.
it("decodes HTML entities", () => {
expect(htmlToText("Tom &amp; Jerry &lt;3")).toBe("Tom & Jerry <3");
});
// Runs of spaces/newlines collapse to a single space; edges are trimmed.
it("collapses whitespace and trims", () => {
expect(htmlToText(" a\n\n b ")).toBe("a b");
});
it("returns empty string for empty input", () => {
expect(htmlToText("")).toBe("");
});
it("leaves plain text untouched", () => {
expect(htmlToText("Just a subject")).toBe("Just a subject");
});
});
describe("extractInlineCids", () => {
it("collects normalized cids referenced by cid: image sources", () => {
const html = '<body><img src="cid:ii_abc"/><img src="CID:ii_def"/></body>';
+71
View File
@@ -2,6 +2,8 @@ import { parseHTML } from "linkedom";
import escapeHtml from "escape-html";
import type { AttachmentData } from "../types";
type ParsedDocument = ReturnType<typeof parseHTML>["document"];
// Strip surrounding angle brackets and whitespace from a Content-ID so that a
// stored value like "<ii_mpi85rqy0>" matches an HTML reference "cid:ii_mpi85rqy0".
export function normalizeCid(
@@ -28,6 +30,66 @@ export function extractInlineCids(content: string): Set<string> {
return cids;
}
/**
 * Reduces an HTML fragment (or a string that is already plain) to plain text.
 *
 * The value is wrapped in a `<body>` and parsed, the text content of the
 * resulting document is taken (which drops tags and decodes entities), and
 * whitespace runs are collapsed to single spaces with the ends trimmed.
 * Used for feed `<title>`s, which must be plain text — raw markup or entities
 * would otherwise be displayed literally by readers.
 *
 * NOTE(review): the input is always parsed as HTML, so a plain-text subject
 * that happens to contain something tag-like would lose that part — confirm
 * this is acceptable for subjects.
 *
 * @param value HTML fragment or plain string; empty input yields "".
 * @returns The whitespace-normalized text content.
 */
export function htmlToText(value: string): string {
  if (!value) {
    return "";
  }
  const parsed = parseHTML(`<body>${value}</body>`);
  const rawText = parsed.document.documentElement?.textContent ?? "";
  const collapsed = rawText.replace(/\s+/g, " ");
  return collapsed.trim();
}
/**
 * Makes lazily-loaded images render without their JavaScript loader.
 *
 * Newsletters often park the real image URL in `data-src` (or a similar
 * attribute) and rely on a script plus `loading="lazy"`; feed readers run no
 * such script, so the image would stay blank. For every `<img>`:
 * - the first non-empty of `data-src`, `data-original`, `data-lazy-src` is
 *   copied into `src`, but only when `src` is empty or a `data:` placeholder;
 * - `data-srcset` is copied into `srcset` when no `srcset` is set;
 * - the `loading` attribute is removed.
 *
 * The document is mutated in place; the `data-*` attributes are left as is.
 */
function promoteLazyImages(document: ParsedDocument): void {
  const deferredSrcAttrs = ["data-src", "data-original", "data-lazy-src"];

  document.querySelectorAll("img").forEach((img: Element) => {
    // First non-empty deferred source wins; later attributes are not consulted.
    let deferredSrc: string | null = null;
    for (const attrName of deferredSrcAttrs) {
      const candidate = img.getAttribute(attrName);
      if (candidate) {
        deferredSrc = candidate;
        break;
      }
    }

    if (deferredSrc) {
      const existing = (img.getAttribute("src") ?? "").trim();
      // Only overwrite a missing src or an inline data: placeholder.
      const isPlaceholder = !existing || /^data:/i.test(existing);
      if (isPlaceholder) {
        img.setAttribute("src", deferredSrc);
      }
    }

    const deferredSrcset = img.getAttribute("data-srcset");
    if (deferredSrcset) {
      if (!img.getAttribute("srcset")) {
        img.setAttribute("srcset", deferredSrcset);
      }
    }

    img.removeAttribute("loading");
  });
}
/**
 * Resolves one URL attribute value against the sender base.
 *
 * @param value Raw attribute value; surrounding whitespace is ignored.
 * @param base  Base URL to resolve against.
 * @returns The resolved absolute URL, or `null` when the value should be left
 *   alone: it is blank, starts with http(s):, mailto:, tel:, data:, cid: or
 *   `#`, or cannot be resolved against `base` (e.g. `base` is not a valid URL).
 */
function toAbsolute(value: string, base: string): string | null {
  const candidate = value.trim();
  if (candidate === "") {
    return null;
  }

  // Already-absolute web URLs, non-navigational schemes and in-page anchors
  // are never rewritten.
  const leaveAlone = /^(https?:|mailto:|tel:|data:|cid:|#)/i;
  if (leaveAlone.test(candidate)) {
    return null;
  }

  let resolved: string | null;
  try {
    resolved = new URL(candidate, base).href;
  } catch {
    // Unresolvable input is treated as "do not touch".
    resolved = null;
  }
  return resolved;
}
/**
 * Rewrites relative link and image URLs in the document to absolute ones.
 *
 * Most readers ignore xml:base, so a relative href/src inside feed content
 * would break. Each `href` on `<a>`/`<area>` and each `src` on `<img>` is
 * resolved against `base` (the sender's site, a best-effort guess derived from
 * its email domain). Protocol-relative `//host/x` values are resolved too and
 * pick up the scheme of `base`. Values that `toAbsolute` declines to resolve
 * are left untouched.
 *
 * The document is mutated in place. An empty `base` disables the rewrite.
 */
function absolutizeUrls(document: ParsedDocument, base: string): void {
  if (!base) {
    return;
  }

  // Selector → attribute pairs, processed in this order (links, then images).
  const targets = [
    ["a[href], area[href]", "href"],
    ["img[src]", "src"],
  ] as const;

  for (const [selector, attr] of targets) {
    document.querySelectorAll(selector).forEach((el: Element) => {
      const resolved = toAbsolute(el.getAttribute(attr) ?? "", base);
      if (resolved) {
        el.setAttribute(attr, resolved);
      }
    });
  }
}
function cleanMsoStyles(style: string): string {
return style
.split(";")
@@ -98,11 +160,15 @@ function sanitizeElement(el: Element): void {
* - Rewrites inline cid: image refs to the stored attachment URL. baseUrl=""
* yields relative URLs (entry page, same origin); a baseUrl yields absolute
* URLs (feeds, for external RSS readers).
* - Promotes lazy-loaded images (data-src → src, strips loading="lazy").
* - Absolutizes relative href/src against senderBaseUrl (the sender's site,
* best-effort) so links/images don't break in readers that ignore xml:base.
*/
export function processEmailContent(
content: string,
attachments?: AttachmentData[],
baseUrl = "",
senderBaseUrl = "",
): string {
if (!content) return "";
@@ -124,6 +190,11 @@ export function processEmailContent(
document.querySelectorAll("*").forEach((el: Element) => sanitizeElement(el));
promoteLazyImages(document);
// Absolutize first: cid: refs are skipped here (not http(s)), then rewritten
// below to our /files/ URL — which must NOT be absolutized to the sender.
absolutizeUrls(document, senderBaseUrl);
if (cidMap.size > 0) {
document
.querySelectorAll("[src]")
+5
View File
@@ -47,6 +47,11 @@ describe("Atom Feed Route", () => {
const res = await testApp.request("/empty-feed", {}, mockEnv);
expect(res.headers.get("Cache-Control")).toBe("max-age=1800");
});
// The Atom response for an existing (empty) feed must carry the noindex header.
it("sets X-Robots-Tag: noindex", async () => {
const res = await testApp.request("/empty-feed", {}, mockEnv);
expect(res.headers.get("X-Robots-Tag")).toBe("noindex");
});
});
describe("valid feed with emails", () => {
+1
View File
@@ -40,6 +40,7 @@ export async function handle(c: Context<{ Bindings: Env }>): Promise<Response> {
headers: {
"Content-Type": "application/atom+xml",
"Cache-Control": "max-age=1800",
"X-Robots-Tag": "noindex",
Link: linkHeader,
},
});
+7
View File
@@ -170,4 +170,11 @@ describe("GET /entries/:feedId/:entryId", () => {
"default-src 'none'",
);
});
// The rendered entry page must carry the noindex header.
// NOTE(review): `env as any` bypasses type checking of the bindings passed to
// app.request — consider a narrower cast if the surrounding file allows it.
it("sets X-Robots-Tag: noindex", async () => {
await seedFeed(env);
const app = makeApp();
const res = await app.request(`/${FEED_ID}/${RECEIVED_AT}`, {}, env as any);
expect(res.headers.get("X-Robots-Tag")).toBe("noindex");
});
});
+10 -5
View File
@@ -2,6 +2,7 @@ import { Context } from "hono";
import { html, raw } from "hono/html";
import { Env } from "../types";
import { processEmailContent } from "../infrastructure/html-processor";
import { EmailAddress } from "../domain/value-objects/email-address";
import { formatBytes } from "../domain/format";
import { FeedRepository } from "../infrastructure/feed-repository";
import { FeedId } from "../domain/value-objects/feed-id";
@@ -46,6 +47,14 @@ export async function handle(c: Context<{ Bindings: Env }>): Promise<Response> {
"Content-Security-Policy",
"default-src 'none'; style-src 'unsafe-inline'; img-src *; frame-src 'none'",
);
c.header("X-Robots-Tag", "noindex");
const bodyContent = processEmailContent(
emailData.content,
emailData.attachments,
"",
EmailAddress.parse(emailData.from)?.siteBaseUrl() ?? "",
);
// Inline images render in place (cid: refs are rewritten by processEmailContent);
// only genuine, downloadable attachments belong in the list below.
@@ -92,11 +101,7 @@ export async function handle(c: Context<{ Bindings: Env }>): Promise<Response> {
<dt>Date:</dt>
<dd>${new Date(emailData.receivedAt).toUTCString()}</dd>
</dl>
<div class="content">
${raw(
processEmailContent(emailData.content, emailData.attachments),
)}
</div>
<div class="content">${raw(bodyContent)}</div>
${attachmentsSection}
</body>
</html>`,
+10
View File
@@ -72,6 +72,16 @@ describe("GET /files/:attachmentId/:filename", () => {
);
});
// Stores a small PDF-typed object in the mock R2 bucket, then checks that the
// file response carries the noindex header.
it("sets X-Robots-Tag: noindex", async () => {
const content = new TextEncoder().encode("data").buffer as ArrayBuffer;
await mockR2.put("robots-uuid", content, {
httpMetadata: { contentType: "application/pdf" },
});
const res = await request(envWithR2, "/files/robots-uuid/doc.pdf");
expect(res.headers.get("X-Robots-Tag")).toBe("noindex");
});
it("sets Content-Disposition from httpMetadata when present", async () => {
const content = new TextEncoder().encode("data").buffer as ArrayBuffer;
await mockR2.put("disp-uuid", content, {
+1
View File
@@ -25,6 +25,7 @@ export async function handle(c: Context<{ Bindings: Env }>): Promise<Response> {
object.writeHttpMetadata(headers);
headers.set("etag", object.httpEtag);
headers.set("Cache-Control", "public, max-age=31536000, immutable");
headers.set("X-Robots-Tag", "noindex");
if (!headers.get("Content-Disposition")) {
headers.set(
+56
View File
@@ -0,0 +1,56 @@
import { describe, it, expect, beforeEach } from "vitest";
import { Hono } from "hono";
import { handle } from "./rss";
import { createMockEnv } from "../test/setup";
import { Env } from "../types";
// Route-level tests for the RSS handler, mounted on a throwaway Hono app with
// mock bindings that are rebuilt before every test.
describe("RSS Feed Route", () => {
  let testApp: Hono;
  let mockEnv: Env;

  // GETs `path` on the current test app using the current mock bindings.
  const get = (path: string) => testApp.request(path, {}, mockEnv);

  beforeEach(() => {
    mockEnv = createMockEnv() as unknown as Env;
    testApp = new Hono();
    testApp.get("/:feedId", handle);
  });

  describe("unknown feed", () => {
    it("returns 404 when no metadata exists in KV", async () => {
      const res = await get("/nonexistent-feed");
      expect(res.status).toBe(404);
      expect(await res.text()).toBe("Feed not found");
    });
  });

  describe("valid feed with no emails", () => {
    // Seed metadata for a feed that exists but has no emails yet.
    beforeEach(async () => {
      const emptyMetadata = JSON.stringify({ emails: [] });
      await mockEnv.EMAIL_STORAGE.put("feed:empty-feed:metadata", emptyMetadata);
    });

    it("returns 200 with application/rss+xml content type", async () => {
      const res = await get("/empty-feed");
      expect(res.status).toBe(200);
      expect(res.headers.get("Content-Type")).toContain("application/rss+xml");
    });

    it("includes Cache-Control header", async () => {
      const res = await get("/empty-feed");
      expect(res.headers.get("Cache-Control")).toBe("max-age=1800");
    });

    it("sets X-Robots-Tag: noindex", async () => {
      const res = await get("/empty-feed");
      expect(res.headers.get("X-Robots-Tag")).toBe("noindex");
    });

    it("Link header advertises hub and self for WebSub discovery", async () => {
      const res = await get("/empty-feed");
      const link = res.headers.get("Link") ?? "";
      expect(link).toContain(`rel="hub"`);
      expect(link).toContain(`rel="self"`);
    });
  });
});
+1
View File
@@ -40,6 +40,7 @@ export async function handle(c: Context<{ Bindings: Env }>): Promise<Response> {
headers: {
"Content-Type": "application/rss+xml",
"Cache-Control": "max-age=1800",
"X-Robots-Tag": "noindex",
Link: linkHeader,
},
});