fix(feed): escape bare ampersands in entry HTML attribute URLs

linkedom escapes & in text nodes but not in attribute values, so URLs
with query strings (?a=1&b=2) were serialized with bare ampersands. That is
valid XML inside the feed CDATA, but the W3C validator parses the embedded HTML and
warns "Named entity expected. Got none." on <description>/<content:encoded>
(RSS) and <summary>/<content> (Atom). Escape every & not already starting
a valid entity; covers all three formats via processEmailContent.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Julien Herr
2026-05-25 22:49:57 +02:00
parent 5f13126b35
commit 7297e06b94
3 changed files with 36 additions and 1 deletions
+4
View File
@@ -22,6 +22,10 @@ verbatim as the GitHub Release notes — so what you write here is what ships.
cached negatively for 6 hours instead of a full week, so a domain whose icon
was momentarily unavailable (e.g. not yet indexed upstream) is retried on the
next email instead of staying blank for days.
- Feed entry HTML now escapes bare ampersands in attribute URLs (e.g. query
strings like `?a=1&b=2`), clearing the W3C feed validator's "Named entity
expected. Got none." warning and improving interoperability with stricter
feed readers.
## [0.3.1] - 2026-05-25
+19
View File
@@ -104,6 +104,25 @@ describe("processEmailContent — attribute sanitization", () => {
const result = processEmailContent(html);
expect(result).toContain("https://example.com");
});
it("escapes bare ampersands in attribute URLs (W3C feed-valid HTML)", () => {
  // An anchor whose href carries a multi-parameter query string with raw `&`.
  const output = processEmailContent(
    '<body><a href="https://example.com/?a=1&b=2&utm_source=x">link</a></body>',
  );

  // Every separator in the query string must come back as `&amp;`.
  const escapedUrl = "https://example.com/?a=1&amp;b=2&amp;utm_source=x";
  expect(output).toContain(escapedUrl);

  // No `&` may remain unless it is the start of `&amp;`.
  const strayAmpersand = /&(?!amp;)/;
  expect(output).not.toMatch(strayAmpersand);
});
it("does not double-escape existing entities", () => {
  // Input already contains named and numeric entities, in text and in an href.
  const output = processEmailContent(
    '<body><p>Tom &amp; Jerry &#39; &lt;tag&gt;</p><a href="https://x.com/?q=a&amp;b">l</a></body>',
  );

  // Pre-escaped ampersands survive exactly once, in the text and in the URL.
  for (const expected of ["Tom &amp; Jerry", "?q=a&amp;b"]) {
    expect(output).toContain(expected);
  }
  // A doubled escape would show up as `&amp;amp;`.
  expect(output).not.toContain("&amp;amp;");
});
});
describe("processEmailContent — mso style cleanup", () => {
+13 -1
View File
@@ -159,6 +159,18 @@ function isPlainText(content: string): boolean {
return !/<[a-z][\s\S]*>/i.test(content);
}
/**
 * Escapes every bare `&` in serialized HTML as `&amp;`.
 *
 * linkedom escapes `&` in text nodes but not in attribute values, so a URL like
 * `?a=1&b=2` serializes with bare ampersands. That's valid XML inside the feed's
 * CDATA, but the W3C feed validator parses the embedded HTML and warns
 * ("Named entity expected. Got none.").
 *
 * An `&` is left untouched only when it starts a syntactically well-formed
 * character reference terminated by `;`:
 *   - named:   `&amp;`, `&lt;`, …
 *   - decimal: `&#39;`
 *   - hex:     `&#x27;` or `&#X27;` (the `x` marker is case-insensitive in HTML)
 *
 * The check is purely syntactic; it does not verify that a named reference
 * actually exists in the HTML entity table.
 *
 * @param html Serialized HTML markup.
 * @returns The markup with every bare `&` replaced by `&amp;`.
 */
function escapeBareAmpersands(html: string): string {
  return html.replace(
    // Negative lookahead: skip `&` that already begins `name;`, `#digits;`,
    // or `#x`/`#X` followed by hex digits and `;`.
    /&(?!(?:[a-zA-Z][a-zA-Z0-9]*|#\d+|#[xX][0-9a-fA-F]+);)/g,
    "&amp;",
  );
}
function rewriteCidSrc(
el: Element,
cidMap: Map<string, AttachmentData>,
@@ -261,5 +273,5 @@ export function processEmailContent(
// Full documents expose a <body>; bodyless fragments are serialized directly
// so that sanitization and cid rewriting still apply to their nodes.
const body = document.querySelector("body");
return body ? body.innerHTML : document.toString();
return escapeBareAmpersands(body ? body.innerHTML : document.toString());
}