From d561b6b81f80c124a9e382e7ba301c6f5526d7b9 Mon Sep 17 00:00:00 2001 From: Julien Herr Date: Mon, 25 May 2026 08:55:24 +0200 Subject: [PATCH] feat(infra): extractLinks for confirmation detection --- src/infrastructure/html-processor.test.ts | 29 +++++++++++++++++++++++ src/infrastructure/html-processor.ts | 28 ++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/src/infrastructure/html-processor.test.ts b/src/infrastructure/html-processor.test.ts index 9dba7cf..1c3d7ec 100644 --- a/src/infrastructure/html-processor.test.ts +++ b/src/infrastructure/html-processor.test.ts @@ -3,6 +3,7 @@ import { processEmailContent, extractInlineCids, htmlToText, + extractLinks, } from "./html-processor"; import type { AttachmentData } from "../types"; @@ -319,3 +320,31 @@ describe("extractInlineCids", () => { expect(extractInlineCids("").size).toBe(0); }); }); + +describe("extractLinks", () => { + it("collects anchor href + text from HTML", () => { + const links = extractLinks( + '

<p>hi <a href="https://x.example/confirm?t=1">Confirm</a> and <a href="https://x.example/home">Home</a></p>

', + ); + expect(links).toEqual([ + { href: "https://x.example/confirm?t=1", text: "Confirm" }, + { href: "https://x.example/home", text: "Home" }, + ]); + }); + + it("falls back to regex URL extraction for plain text", () => { + const links = extractLinks( + "Confirm here: https://x.example/verify/abc thanks", + ); + expect(links).toEqual([ + { + href: "https://x.example/verify/abc", + text: "https://x.example/verify/abc", + }, + ]); + }); + + it("returns an empty array for empty content", () => { + expect(extractLinks("")).toEqual([]); + }); +}); diff --git a/src/infrastructure/html-processor.ts b/src/infrastructure/html-processor.ts index ae1d948..6b84803 100644 --- a/src/infrastructure/html-processor.ts +++ b/src/infrastructure/html-processor.ts @@ -41,6 +41,34 @@ export function htmlToText(value: string): string { .trim(); } +// Collect the links from an email body for confirmation detection: anchor href + +// visible text from HTML, or a regex URL sweep for plain-text bodies. Infra owns +// the DOM parse; the domain detector receives plain tuples. +export function extractLinks( + content: string, +): { href: string; text: string }[] { + if (!content) return []; + + if (isPlainText(content)) { + const urls = content.match(/https?:\/\/[^\s<>"')]+/gi) ?? []; + return urls.map((url) => ({ href: url, text: url })); + } + + const { document } = parseHTML(content); + const links: { href: string; text: string }[] = []; + document.querySelectorAll("a[href]").forEach((el: Element) => { + const href = (el.getAttribute("href") ?? "").trim(); + if (!href) return; + links.push({ + href, + text: ((el as unknown as { textContent?: string }).textContent ?? "") + .replace(/\s+/g, " ") + .trim(), + }); + }); + return links; +} + // Newsletters frequently defer images via data-src/loading="lazy"; readers don't // run the lazy-loader, so the image renders blank. Promote the real source. function promoteLazyImages(document: ParsedDocument): void {