feat(infra): extractLinks for confirmation detection

2026-06-20 22:03:48 +00:00 · 2026-05-25 08:55:24 +02:00
parent e4e3d62f5a
commit d561b6b81f
2 changed files with 57 additions and 0 deletions
@@ -3,6 +3,7 @@ import {
  processEmailContent,
  extractInlineCids,
  htmlToText,
+  extractLinks,
 } from "./html-processor";
 import type { AttachmentData } from "../types";

@@ -319,3 +320,31 @@ describe("extractInlineCids", () => {
    expect(extractInlineCids("").size).toBe(0);
  });
 });
+
+describe("extractLinks", () => {
+  // HTML input: each anchor contributes its href and visible text, in order.
+  it("collects anchor href + text from HTML", () => {
+    const html =
+      '<p>hi <a href="https://x.example/confirm?t=1">Confirm</a> and <a href="https://x.example/home">Home</a></p>';
+    const expected = [
+      { href: "https://x.example/confirm?t=1", text: "Confirm" },
+      { href: "https://x.example/home", text: "Home" },
+    ];
+
+    expect(extractLinks(html)).toEqual(expected);
+  });
+
+  // Plain-text input: the bare URL doubles as both href and text.
+  it("falls back to regex URL extraction for plain text", () => {
+    const url = "https://x.example/verify/abc";
+    const body = "Confirm here: https://x.example/verify/abc thanks";
+
+    expect(extractLinks(body)).toEqual([{ href: url, text: url }]);
+  });
+
+  // Empty input produces no links.
+  it("returns an empty array for empty content", () => {
+    const links = extractLinks("");
+
+    expect(links).toEqual([]);
+  });
+});
@@ -41,6 +41,34 @@ export function htmlToText(value: string): string {
    .trim();
 }

+// Collect the links from an email body for confirmation detection.
+//
+// - Empty content yields an empty array.
+// - Plain-text bodies (as decided by isPlainText) get a regex URL sweep; each
+//   URL is used as both href and text. Trailing sentence punctuation
+//   (. , ; : ! ?) is trimmed so ".../verify/abc." does not yield a broken link.
+// - Otherwise the content is parsed as HTML and every <a href> contributes its
+//   trimmed href plus its whitespace-collapsed visible text; anchors whose
+//   href is blank are skipped.
+//
+// Infra owns the DOM parse; the domain detector receives plain tuples.
+export function extractLinks(
+  content: string,
+): { href: string; text: string }[] {
+  if (!content) return [];
+
+  if (isPlainText(content)) {
+    const urls = content.match(/https?:\/\/[^\s<>"')]+/gi) ?? [];
+    return urls.map((raw) => {
+      // Prose often ends a sentence right after a URL; that punctuation is not
+      // part of the link.
+      const url = raw.replace(/[.,;:!?]+$/, "");
+      return { href: url, text: url };
+    });
+  }
+
+  const { document } = parseHTML(content);
+  const links: { href: string; text: string }[] = [];
+  document.querySelectorAll("a[href]").forEach((el: Element) => {
+    const href = (el.getAttribute("href") ?? "").trim();
+    if (!href) return;
+    links.push({
+      href,
+      // NOTE(review): cast kept as-is; presumably the Element type in scope
+      // does not declare textContent. Confirm before simplifying.
+      text: ((el as unknown as { textContent?: string }).textContent ?? "")
+        .replace(/\s+/g, " ")
+        .trim(),
+    });
+  });
+  return links;
+}
+
 // Newsletters frequently defer images via data-src/loading="lazy"; readers don't
 // run the lazy-loader, so the image renders blank. Promote the real source.
 function promoteLazyImages(document: ParsedDocument): void {