feat(infra): extractLinks for confirmation detection

2026-06-20 22:03:48 +00:00 · 2026-05-25 08:55:24 +02:00
parent e4e3d62f5a
commit d561b6b81f
2 changed files with 57 additions and 0 deletions
@@ -3,6 +3,7 @@ import {
  processEmailContent,
  extractInlineCids,
  htmlToText,
  extractLinks,
 } from "./html-processor";
 import type { AttachmentData } from "../types";
@@ -319,3 +320,31 @@ describe("extractInlineCids", () => {
    expect(extractInlineCids("").size).toBe(0);
  });
 });
 // extractLinks: anchor tuples from HTML bodies, bare URLs from plain-text
 // bodies, and nothing at all for an empty body.
 describe("extractLinks", () => {
  it("collects anchor href + text from HTML", () => {
    const html =
      '<p>hi <a href="https://x.example/confirm?t=1">Confirm</a> and <a href="https://x.example/home">Home</a></p>';
    const expected = [
      { href: "https://x.example/confirm?t=1", text: "Confirm" },
      { href: "https://x.example/home", text: "Home" },
    ];

    expect(extractLinks(html)).toEqual(expected);
  });

  it("falls back to regex URL extraction for plain text", () => {
    // In plain text there is no anchor label, so the URL doubles as the text.
    const url = "https://x.example/verify/abc";
    const body = "Confirm here: https://x.example/verify/abc thanks";

    expect(extractLinks(body)).toEqual([{ href: url, text: url }]);
  });

  it("returns an empty array for empty content", () => {
    const result = extractLinks("");

    expect(result).toEqual([]);
  });
 });
@@ -41,6 +41,34 @@ export function htmlToText(value: string): string {
    .trim();
 }
 /**
  * Collect the links from an email body for confirmation detection.
  *
  * HTML bodies are parsed and every `a[href]` element contributes its trimmed
  * `href` together with its whitespace-collapsed text content. Bodies that
  * `isPlainText` classifies as plain text are swept with a URL regex instead,
  * and each match serves as both `href` and `text`. Infra owns the DOM parse;
  * the domain detector receives plain tuples.
  *
  * @param content - Raw email body, either HTML or plain text.
  * @returns The links in the order they were found; an empty array when
  *   `content` is empty or contains no links.
  */
 export function extractLinks(
  content: string,
 ): { href: string; text: string }[] {
  if (!content) return [];
  if (isPlainText(content)) {
    const matches = content.match(/https?:\/\/[^\s<>"')]+/gi) ?? [];
    return matches.map((raw) => {
      // Punctuation that directly follows a URL in running prose
      // ("…/verify/abc." or "…/abc, then") belongs to the sentence, not to the
      // link; leaving it attached yields an href that does not resolve.
      const url = raw.replace(/[.,;:!?]+$/, "");
      return { href: url, text: url };
    });
  }
  const { document } = parseHTML(content);
  const links: { href: string; text: string }[] = [];
  document.querySelectorAll("a[href]").forEach((el: Element) => {
    const href = (el.getAttribute("href") ?? "").trim();
    // `a[href]` also matches anchors whose href is empty or whitespace-only;
    // those carry no destination, so skip them.
    if (!href) return;
    links.push({
      href,
      // Collapse runs of whitespace so labels that span several source lines
      // or contain nested markup come out as one clean single-line string.
      text: ((el as unknown as { textContent?: string }).textContent ?? "")
        .replace(/\s+/g, " ")
        .trim(),
    });
  });
  return links;
 }
 // Newsletters frequently defer images via data-src/loading="lazy"; readers don't
 // run the lazy-loader, so the image renders blank. Promote the real source.
 function promoteLazyImages(document: ParsedDocument): void {