From d561b6b81f80c124a9e382e7ba301c6f5526d7b9 Mon Sep 17 00:00:00 2001
From: Julien Herr <julien@herr.fr>
Date: Mon, 25 May 2026 08:55:24 +0200
Subject: [PATCH] feat(infra): extractLinks for confirmation detection

---
 src/infrastructure/html-processor.test.ts | 29 +++++++++++++++++++++++
 src/infrastructure/html-processor.ts      | 28 ++++++++++++++++++++++
 2 files changed, 57 insertions(+)
diff --git a/src/infrastructure/html-processor.test.ts b/src/infrastructure/html-processor.test.ts
index 9dba7cf..1c3d7ec 100644
--- a/src/infrastructure/html-processor.test.ts
+++ b/src/infrastructure/html-processor.test.ts
@@ -3,6 +3,7 @@ import {
   processEmailContent,
   extractInlineCids,
   htmlToText,
+  extractLinks,
 } from "./html-processor";
 import type { AttachmentData } from "../types";
 
@@ -319,3 +320,31 @@ describe("extractInlineCids", () => {
     expect(extractInlineCids("").size).toBe(0);
   });
 });
+
+// Covers the three extractLinks paths: HTML anchors, the plain-text URL sweep,
+// and the empty-content guard.
+describe("extractLinks", () => {
+  it("collects anchor href + text from HTML", () => {
+    // Both anchors are expected back in the order they appear in the markup.
+    const html =
+      '<p>hi <a href="https://x.example/confirm?t=1">Confirm</a> and <a href="https://x.example/home">Home</a></p>';
+    const expected = [
+      { href: "https://x.example/confirm?t=1", text: "Confirm" },
+      { href: "https://x.example/home", text: "Home" },
+    ];
+    expect(extractLinks(html)).toEqual(expected);
+  });
+
+  it("falls back to regex URL extraction for plain text", () => {
+    // Plain text has no anchor label, so the URL is expected as href and text.
+    const url = "https://x.example/verify/abc";
+    const body = `Confirm here: ${url} thanks`;
+    expect(extractLinks(body)).toEqual([{ href: url, text: url }]);
+  });
+
+  it("returns an empty array for empty content", () => {
+    // An empty body yields an empty list.
+    const noLinks = extractLinks("");
+    expect(noLinks).toEqual([]);
+  });
+});
diff --git a/src/infrastructure/html-processor.ts b/src/infrastructure/html-processor.ts
index ae1d948..6b84803 100644
--- a/src/infrastructure/html-processor.ts
+++ b/src/infrastructure/html-processor.ts
@@ -41,6 +41,34 @@ export function htmlToText(value: string): string {
     .trim();
 }
 
+// Collect the links from an email body for confirmation detection: trimmed
+// anchor href + whitespace-collapsed visible text from HTML, or a regex URL
+// sweep for plain-text bodies (the URL doubles as the text). Empty content
+// gives []. Infra owns the DOM parse; the domain detector gets plain tuples.
+export function extractLinks(
+  content: string,
+): { href: string; text: string }[] {
+  if (!content) return [];
+
+  if (isPlainText(content)) {
+    // The final class bars sentence punctuation, so ".../verify/abc." yields an
+    // href without the trailing "." (likewise for , ; : ! ?).
+    const urlPattern = /https?:\/\/[^\s<>"')]*[^\s<>"').,;:!?]/gi;
+    const urls = content.match(urlPattern) ?? [];
+    return urls.map((url) => ({ href: url, text: url }));
+  }
+
+  const { document } = parseHTML(content);
+  const links: { href: string; text: string }[] = [];
+  document.querySelectorAll("a[href]").forEach((el: Element) => {
+    const href = (el.getAttribute("href") ?? "").trim();
+    if (!href) return; // skip anchors whose href is blank after trimming
+    const text = (el as unknown as { textContent?: string }).textContent ?? "";
+    links.push({ href, text: text.replace(/\s+/g, " ").trim() });
+  });
+  return links;
+}
+
 // Newsletters frequently defer images via data-src/loading="lazy"; readers don't
 // run the lazy-loader, so the image renders blank. Promote the real source.
 function promoteLazyImages(document: ParsedDocument): void {