diff --git a/CHANGELOG.md b/CHANGELOG.md
index d4920f7..d3df1e3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,10 @@ verbatim as the GitHub Release notes — so what you write here is what ships.
   cached negatively for 6 hours instead of a full week, so a domain whose icon
   was momentarily unavailable (e.g. not yet indexed upstream) is retried on the
   next email instead of staying blank for days.
+- Feed entry HTML now escapes bare ampersands in attribute URLs (e.g. query
+  strings like `?a=1&b=2`), clearing the W3C feed validator's "Named entity
+  expected. Got none." warning and improving interoperability with stricter
+  feed readers.
 
 ## [0.3.1] - 2026-05-25
 
diff --git a/src/infrastructure/html-processor.test.ts b/src/infrastructure/html-processor.test.ts
index 7a548f4..4c2a043 100644
--- a/src/infrastructure/html-processor.test.ts
+++ b/src/infrastructure/html-processor.test.ts
@@ -104,6 +104,25 @@ describe("processEmailContent — attribute sanitization", () => {
     const result = processEmailContent(html);
     expect(result).toContain("https://example.com");
   });
+
+  it("escapes bare ampersands in attribute URLs (W3C feed-valid HTML)", () => {
+    const html =
+      '<a href="https://example.com/?a=1&b=2&utm_source=x">link</a>';
+    const result = processEmailContent(html);
+    expect(result).toContain(
+      "https://example.com/?a=1&amp;b=2&amp;utm_source=x",
+    );
+    expect(result).not.toMatch(/&(?!amp;)/);
+  });
+
+  it("does not double-escape existing entities", () => {
+    const html =
+      '<p>Tom &amp; Jerry &#39; &lt;tag&gt;</p><a href="?q=a&amp;b">l</a>';
+    const result = processEmailContent(html);
+    expect(result).toContain("Tom &amp; Jerry");
+    expect(result).not.toContain("&amp;amp;");
+    expect(result).toContain("?q=a&amp;b");
+  });
 });
 
 describe("processEmailContent — mso style cleanup", () => {
diff --git a/src/infrastructure/html-processor.ts b/src/infrastructure/html-processor.ts
index 551fe22..42c240b 100644
--- a/src/infrastructure/html-processor.ts
+++ b/src/infrastructure/html-processor.ts
@@ -159,6 +159,18 @@ function isPlainText(content: string): boolean {
   return !/<[a-z][\s\S]*>/i.test(content);
 }
 
+// linkedom escapes `&` in text nodes but not in attribute values, so a URL like
+// `?a=1&b=2` serializes with bare ampersands. That's valid XML inside the feed's
+// CDATA, but the W3C feed validator parses the embedded HTML and warns
+// ("Named entity expected. Got none."). Escape every `&` that doesn't already
+// start a valid entity (named, decimal, or hex) — leaves `&amp;`/`&#39;` intact.
+function escapeBareAmpersands(html: string): string {
+  return html.replace(
+    /&(?!(?:[a-zA-Z][a-zA-Z0-9]*|#\d+|#x[0-9a-fA-F]+);)/g,
+    "&amp;",
+  );
+}
+
 function rewriteCidSrc(
   el: Element,
   cidMap: Map,
@@ -261,5 +273,5 @@ export function processEmailContent(
   // Full documents expose a <body>; bodyless fragments are serialized directly
   // so that sanitization and cid rewriting still apply to their nodes.
   const body = document.querySelector("body");
-  return body ? body.innerHTML : document.toString();
+  return escapeBareAmpersands(body ? body.innerHTML : document.toString());
 }