fix(feed): escape bare ampersands in entry HTML attribute URLs

linkedom escapes & in text nodes but not in attribute values, so URLs with query strings (?a=1&b=2) serialized with bare ampersands. Valid XML inside the feed CDATA, but the W3C validator parses the embedded HTML and warns "Named entity expected. Got none." on <description>/<content:encoded> (RSS) and <summary>/<content> (Atom). Escape every & not already starting a valid entity; covers all three formats via processEmailContent. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-20 22:03:48 +00:00 · 2026-05-25 22:49:57 +02:00
parent 5f13126b35
commit 7297e06b94
3 changed files with 36 additions and 1 deletions
@@ -22,6 +22,10 @@ verbatim as the GitHub Release notes — so what you write here is what ships.
  cached negatively for 6 hours instead of a full week, so a domain whose icon
  was momentarily unavailable (e.g. not yet indexed upstream) is retried on the
  next email instead of staying blank for days.
 - Feed entry HTML now escapes bare ampersands in attribute URLs (e.g. query
  strings like `?a=1&b=2`), clearing the W3C feed validator's "Named entity
  expected. Got none." warning and improving interoperability with stricter
  feed readers.
 ## [0.3.1] - 2026-05-25
@@ -104,6 +104,25 @@ describe("processEmailContent — attribute sanitization", () => {
    const result = processEmailContent(html);
    expect(result).toContain("https://example.com");
  });
  // Regression test: raw `&` separators inside an href must come out as `&amp;`.
  it("escapes bare ampersands in attribute URLs (W3C feed-valid HTML)", () => {
    // The href joins three query parameters with unescaped ampersands.
    const html =
      '<body><a href="https://example.com/?a=1&b=2&utm_source=x">link</a></body>';
    const result = processEmailContent(html);
    expect(result).toContain(
      "https://example.com/?a=1&amp;b=2&amp;utm_source=x",
    );
    // Every `&` remaining in the output must be the start of `&amp;`.
    expect(result).not.toMatch(/&(?!amp;)/);
  });
  // Guard against over-escaping: input that already uses entities, both in a
  // text node and in an href, must not gain a second `&amp;` layer.
  it("does not double-escape existing entities", () => {
    const html =
      '<body><p>Tom &amp; Jerry &#39; &lt;tag&gt;</p><a href="https://x.com/?q=a&amp;b">l</a></body>';
    const result = processEmailContent(html);
    expect(result).toContain("Tom &amp; Jerry");
    // `&amp;amp;` would be the signature of an entity escaped twice.
    expect(result).not.toContain("&amp;amp;");
    expect(result).toContain("?q=a&amp;b");
  });
 });
 describe("processEmailContent — mso style cleanup", () => {
@@ -159,6 +159,18 @@ function isPlainText(content: string): boolean {
  return !/<[a-z][\s\S]*>/i.test(content);
 }
 // linkedom escapes `&` in text nodes but not in attribute values, so a URL like
 // `?a=1&b=2` serializes with bare ampersands. That's valid XML inside the feed's
 // CDATA, but the W3C feed validator parses the embedded HTML and warns
 // ("Named entity expected. Got none.").
 //
 // Returns `html` with every `&` escaped to `&amp;` unless it already starts a
 // well-formed character reference: `&name;`, decimal `&#39;`, or hex
 // `&#x27;`/`&#X27;` (HTML accepts the hex marker in either case). Existing
 // `&amp;`/`&#39;` are therefore left intact rather than double-escaped.
 //
 // The check is purely syntactic: names are not validated against the HTML
 // entity table, and a reference without its terminating `;` is treated as a
 // bare ampersand and escaped.
 function escapeBareAmpersands(html: string): string {
  return html.replace(
    /&(?!(?:[a-zA-Z][a-zA-Z0-9]*|#\d+|#[xX][0-9a-fA-F]+);)/g,
    "&amp;",
  );
 }
 function rewriteCidSrc(
  el: Element,
  cidMap: Map<string, AttachmentData>,
@@ -261,5 +273,5 @@ export function processEmailContent(
  // Full documents expose a <body>; bodyless fragments are serialized directly
  // so that sanitization and cid rewriting still apply to their nodes.
  const body = document.querySelector("body");
-  return body ? body.innerHTML : document.toString();
+  return escapeBareAmpersands(body ? body.innerHTML : document.toString());
 }