mirror of
https://github.com/juherr/kill-the-news.git
synced 2026-06-20 22:03:48 +00:00
fix(feed): escape bare ampersands in entry HTML attribute URLs
linkedom escapes & in text nodes but not in attribute values, so URLs with query strings (?a=1&b=2) serialized with bare ampersands. Valid XML inside the feed CDATA, but the W3C validator parses the embedded HTML and warns "Named entity expected. Got none." on <description>/<content:encoded> (RSS) and <summary>/<content> (Atom). Escape every & not already starting a valid entity; covers all three formats via processEmailContent. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,10 @@ verbatim as the GitHub Release notes — so what you write here is what ships.
|
||||
cached negatively for 6 hours instead of a full week, so a domain whose icon
|
||||
was momentarily unavailable (e.g. not yet indexed upstream) is retried on the
|
||||
next email instead of staying blank for days.
|
||||
- Feed entry HTML now escapes bare ampersands in attribute URLs (e.g. query
|
||||
strings like `?a=1&b=2`), clearing the W3C feed validator's "Named entity
|
||||
expected. Got none." warning and improving interoperability with stricter
|
||||
feed readers.
|
||||
|
||||
## [0.3.1] - 2026-05-25
|
||||
|
||||
|
||||
@@ -104,6 +104,25 @@ describe("processEmailContent — attribute sanitization", () => {
|
||||
const result = processEmailContent(html);
|
||||
expect(result).toContain("https://example.com");
|
||||
});
|
||||
|
||||
it("escapes bare ampersands in attribute URLs (W3C feed-valid HTML)", () => {
  // Input deliberately carries bare `&` separators in the href query string.
  const html =
    '<body><a href="https://example.com/?a=1&b=2&utm_source=x">link</a></body>';
  const result = processEmailContent(html);
  // Every separator must come back as the `&amp;` entity...
  expect(result).toContain(
    "https://example.com/?a=1&amp;b=2&amp;utm_source=x",
  );
  // ...and no ampersand may remain that is not the start of `&amp;`.
  expect(result).not.toMatch(/&(?!amp;)/);
});
|
||||
|
||||
it("does not double-escape existing entities", () => {
  // Input already uses well-formed entities in both text and attribute
  // positions; none of them should gain a second layer of escaping.
  const html =
    '<body><p>Tom &amp; Jerry &#39; &lt;tag&gt;</p><a href="https://x.com/?q=a&amp;b">l</a></body>';
  const result = processEmailContent(html);
  // Text-node entity survives as-is.
  expect(result).toContain("Tom &amp; Jerry");
  // A double-escaped ampersand would show up as `&amp;amp;`.
  expect(result).not.toContain("&amp;amp;");
  // Attribute-value entity survives as-is.
  expect(result).toContain("?q=a&amp;b");
});
|
||||
});
|
||||
|
||||
describe("processEmailContent — mso style cleanup", () => {
|
||||
|
||||
@@ -159,6 +159,18 @@ function isPlainText(content: string): boolean {
|
||||
return !/<[a-z][\s\S]*>/i.test(content);
|
||||
}
|
||||
|
||||
// linkedom escapes `&` in text nodes but not in attribute values, so a URL like
// `?a=1&b=2` serializes with bare ampersands. That's valid XML inside the feed's
// CDATA, but the W3C feed validator parses the embedded HTML and warns
// ("Named entity expected. Got none."). Escape every `&` that doesn't already
// start a valid entity (named, decimal, or hex) — leaves `&amp;`/`&#39;` intact.
//
// "Valid" here is purely syntactic: the lookahead accepts `&name;`, `&#123;`
// and `&#x1f;` shapes without checking that a named entity actually exists.
// An `&` followed by an entity-like name but no terminating `;` (e.g. `&copy`)
// is treated as bare and escaped.
function escapeBareAmpersands(html: string): string {
  return html.replace(
    /&(?!(?:[a-zA-Z][a-zA-Z0-9]*|#\d+|#x[0-9a-fA-F]+);)/g,
    // Must be the entity, not a literal `&`; otherwise the replace is a no-op.
    "&amp;",
  );
}
|
||||
|
||||
function rewriteCidSrc(
|
||||
el: Element,
|
||||
cidMap: Map<string, AttachmentData>,
|
||||
@@ -261,5 +273,5 @@ export function processEmailContent(
|
||||
// Full documents expose a <body>; bodyless fragments are serialized directly
|
||||
// so that sanitization and cid rewriting still apply to their nodes.
|
||||
const body = document.querySelector("body");
|
||||
return body ? body.innerHTML : document.toString();
|
||||
return escapeBareAmpersands(body ? body.innerHTML : document.toString());
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user