fix(feed): escape bare ampersands in entry HTML attribute URLs

linkedom escapes & in text nodes but not in attribute values, so URLs
with query strings (?a=1&b=2) were serialized with bare ampersands. That
is valid XML inside the feed CDATA, but the W3C validator parses the
embedded HTML and warns "Named entity expected. Got none." on
<description>/<content:encoded> (RSS) and <summary>/<content> (Atom).
Escape every & that does not already start a valid entity; this covers
all three formats via processEmailContent.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Julien Herr
2026-05-25 22:49:57 +02:00
parent 5f13126b35
commit 7297e06b94
3 changed files with 36 additions and 1 deletions
+4
View File
@@ -22,6 +22,10 @@ verbatim as the GitHub Release notes — so what you write here is what ships.
cached negatively for 6 hours instead of a full week, so a domain whose icon
was momentarily unavailable (e.g. not yet indexed upstream) is retried on the
next email instead of staying blank for days.
- Feed entry HTML now escapes bare ampersands in attribute URLs (e.g. query
strings like `?a=1&b=2`), clearing the W3C feed validator's "Named entity
expected. Got none." warning and improving interoperability with stricter
feed readers.
## [0.3.1] - 2026-05-25 ## [0.3.1] - 2026-05-25
+19
View File
@@ -104,6 +104,25 @@ describe("processEmailContent — attribute sanitization", () => {
const result = processEmailContent(html); const result = processEmailContent(html);
expect(result).toContain("https://example.com"); expect(result).toContain("https://example.com");
}); });
it("escapes bare ampersands in attribute URLs (W3C feed-valid HTML)", () => {
  // The href's query-string separators are raw `&` characters.
  const output = processEmailContent(
    '<body><a href="https://example.com/?a=1&b=2&utm_source=x">link</a></body>',
  );

  // Every separator must come back as `&amp;` ...
  expect(output).toContain("https://example.com/?a=1&amp;b=2&amp;utm_source=x");
  // ... and no `&` anywhere in the output may be anything other than the
  // start of `&amp;`.
  expect(output).not.toMatch(/&(?!amp;)/);
});
it("does not double-escape existing entities", () => {
  // The input already carries named (`&amp;`, `&lt;`, `&gt;`) and numeric
  // (`&#39;`) references in text, plus an escaped `&amp;` inside an href.
  const html =
    '<body><p>Tom &amp; Jerry &#39; &lt;tag&gt;</p><a href="https://x.com/?q=a&amp;b">l</a></body>';
  const result = processEmailContent(html);

  // Named `&amp;` survives exactly once, in both text and attribute.
  expect(result).toContain("Tom &amp; Jerry");
  expect(result).not.toContain("&amp;amp;");
  expect(result).toContain("?q=a&amp;b");

  // The numeric and lt/gt references from the input must not be re-escaped
  // either. These are negative assertions only, so they hold however the
  // serializer chooses to emit the apostrophe.
  expect(result).not.toContain("&amp;#");
  expect(result).not.toContain("&amp;lt;");
  expect(result).not.toContain("&amp;gt;");
});
}); });
describe("processEmailContent — mso style cleanup", () => { describe("processEmailContent — mso style cleanup", () => {
+13 -1
View File
@@ -159,6 +159,18 @@ function isPlainText(content: string): boolean {
return !/<[a-z][\s\S]*>/i.test(content); return !/<[a-z][\s\S]*>/i.test(content);
} }
// linkedom escapes `&` in text nodes but not in attribute values, so a URL like
// `?a=1&b=2` serializes with bare ampersands. That's valid XML inside the feed's
// CDATA, but the W3C feed validator parses the embedded HTML and warns
// ("Named entity expected. Got none.").
//
// Rewrites every `&` that does not already begin a well-formed character
// reference as `&amp;`. A reference counts as well-formed when it ends in `;`
// and is one of:
//   - named:   `&name;` (a letter followed by letters/digits; the name is not
//              checked against the HTML entity table)
//   - decimal: `&#39;`
//   - hex:     `&#x27;` or `&#X27;` (HTML accepts either case for the marker)
// Existing references such as `&amp;` are left untouched, so applying the
// function to its own output changes nothing.
function escapeBareAmpersands(html: string): string {
  return html.replace(
    /&(?!(?:[a-zA-Z][a-zA-Z0-9]*|#\d+|#[xX][0-9a-fA-F]+);)/g,
    "&amp;",
  );
}
function rewriteCidSrc( function rewriteCidSrc(
el: Element, el: Element,
cidMap: Map<string, AttachmentData>, cidMap: Map<string, AttachmentData>,
@@ -261,5 +273,5 @@ export function processEmailContent(
// Full documents expose a <body>; bodyless fragments are serialized directly // Full documents expose a <body>; bodyless fragments are serialized directly
// so that sanitization and cid rewriting still apply to their nodes. // so that sanitization and cid rewriting still apply to their nodes.
const body = document.querySelector("body"); const body = document.querySelector("body");
return body ? body.innerHTML : document.toString(); return escapeBareAmpersands(body ? body.innerHTML : document.toString());
} }