/* Patch envelope followed by the Vitest suite for detectConfirmation. The suite pins: English and French (accent-insensitive) detection returning the confirm link first; null for a newsletter whose only signal is an unsubscribe link; null when no candidate link exists even if the subject matches; unsubscribe links never returned as candidates; strongest candidate ranked first with at most three links returned; non-http(s) links ignored. */
From f2e03534389a06377271fa219b7cee6a878caef2 Mon Sep 17 00:00:00 2001 From: Julien Herr Date: Mon, 25 May 2026 08:36:54 +0200 Subject: [PATCH] feat(domain): confirmation-email detection service --- src/domain/confirmation.test.ts | 100 ++++++++++++++++++++++++++ src/domain/confirmation.ts | 124 ++++++++++++++++++++++++++++++++ 2 files changed, 224 insertions(+) create mode 100644 src/domain/confirmation.test.ts create mode 100644 src/domain/confirmation.ts diff --git a/src/domain/confirmation.test.ts b/src/domain/confirmation.test.ts new file mode 100644 index 0000000..b96208e --- /dev/null +++ b/src/domain/confirmation.test.ts @@ -0,0 +1,100 @@ +import { describe, it, expect } from "vitest"; +import { detectConfirmation } from "./confirmation"; + +describe("detectConfirmation", () => { + it("detects an English confirmation email and returns the confirm link", () => { + const result = detectConfirmation({ + subject: "Please confirm your subscription", + text: "Click the button below to verify your email address.", + links: [ + { + href: "https://news.example.com/confirm?token=abc123", + text: "Confirm subscription", + }, + { href: "https://news.example.com/home", text: "Home" }, + ], + }); + expect(result).not.toBeNull(); + expect(result!.links[0]).toBe( + "https://news.example.com/confirm?token=abc123", + ); + }); + + it("detects a French confirmation email (accent-insensitive)", () => { + const result = detectConfirmation({ + subject: "Confirmez votre inscription", + text: "Cliquez pour activer votre abonnement.", + links: [ + { + href: "https://lettre.example.fr/valider/xyz", + text: "Valider mon inscription", + }, + ], + }); + expect(result).not.toBeNull(); + expect(result!.links[0]).toBe("https://lettre.example.fr/valider/xyz"); + }); + + it("returns null for a normal newsletter with only an unsubscribe link", () => { + const result = detectConfirmation({ + subject: "This week in tech", + text: "Here are the top stories. 
To stop receiving these, unsubscribe here.", + links: [ + { href: "https://news.example.com/article/42", text: "Read more" }, + { + href: "https://news.example.com/unsubscribe?u=9", + text: "Unsubscribe", + }, + ], + }); + expect(result).toBeNull(); + }); + + it("returns null when no candidate link is present even if the subject matches", () => { + const result = detectConfirmation({ + subject: "Confirm your subscription", + text: "Reply to this email to confirm.", + links: [], + }); + expect(result).toBeNull(); + }); + + it("never treats an unsubscribe link as a confirmation candidate", () => { + const result = detectConfirmation({ + subject: "Confirm your email", + text: "Verify your address.", + links: [ + { href: "https://x.example/verify/abc", text: "Verify email" }, + { href: "https://x.example/unsubscribe", text: "unsubscribe" }, + ], + }); + expect(result).not.toBeNull(); + expect(result!.links).not.toContain("https://x.example/unsubscribe"); + }); + + it("ranks the strongest candidate first and caps at three links", () => { + const result = detectConfirmation({ + subject: "Confirm your subscription", + text: "verify activate", + links: [ + { href: "https://x.example/help", text: "help" }, + { href: "https://x.example/a?token=1", text: "click" }, + { href: "https://x.example/confirm?token=2", text: "Confirm" }, + { href: "https://x.example/activate", text: "Activate account" }, + { href: "https://x.example/verify", text: "Verify" }, + ], + }); + expect(result).not.toBeNull(); + expect(result!.links.length).toBeLessThanOrEqual(3); + expect(result!.links[0]).toBe("https://x.example/confirm?token=2"); + }); + + it("ignores non-http(s) links", () => { + const result = detectConfirmation({ + subject: "Confirm your subscription", + text: "verify", + links: [{ href: "mailto:confirm@x.example", text: "confirm" }], + }); + expect(result).toBeNull(); + }); +}); diff --git a/src/domain/confirmation.ts b/src/domain/confirmation.ts new file mode 100644 index 
// (patch metadata, kept verbatim) 0000000..44208da --- /dev/null +++ b/src/domain/confirmation.ts @@ -0,0 +1,124 @@

/**
 * Pure detection of "confirm your subscription" emails. No DOM, no I/O — it
 * receives already-extracted subject/body text and link tuples (infra parses the
 * HTML). This module owns the business knowledge: the multilingual keyword vocab,
 * the link-signal patterns, the scoring weights and the threshold.
 *
 * Returns the ranked, de-duplicated candidate confirmation links (top 3) when the
 * combined score clears the threshold AND at least one candidate link exists;
 * otherwise null. Only http(s) links are ever considered or returned.
 */

export interface DetectConfirmationInput {
  /** Email subject line; normalized internally before matching. */
  subject: string;
  /** Body text; normalized internally before matching. */
  text: string;
  /** Extracted links: target URL plus the anchor text shown for it. */
  links: { href: string; text: string }[];
}

export interface ConfirmationResult {
  /** Combined subject + body + best-link score (always >= the threshold). */
  score: number;
  /** Trimmed candidate URLs, strongest first, unique, at most three. */
  links: string[];
}

/** A link that passed the http(s) and signal filters, with its link-level score. */
interface Candidate {
  href: string;
  score: number;
}

// Confirmation-positive stems, already normalized (lowercased, diacritics stripped).
// EN / FR / DE / ES — extend here to add a language.
// NOTE(review): matching is by substring, so "activ" also hits "activity" and
// "valid" also hits "invalid"; the threshold is what keeps those from firing alone.
const KEYWORDS: readonly string[] = [
  "confirm",
  "verif",
  "activ",
  "valid",
  "bestatig",
  "aktivier",
  "opt-in",
  "opt in",
  "optin",
];

// Link URL/anchor signals (normalized). A link whose URL matches any → candidate.
const LINK_SIGNALS: readonly string[] = [
  "confirm",
  "verif",
  "activ",
  "valid",
  "bestatig",
  "aktivier",
  "optin",
  "opt-in",
  "double-optin",
  "subscription",
  "subscribe",
  "token=",
  "confirm=",
  "activation",
];

// Negative patterns: a link whose URL or anchor text matches any of these is NEVER
// a candidate (this is what stops ".../unsubscribe" from matching the "subscribe"
// link signal). The same tokens are also blanked out of subject/body text before
// keyword scanning. "opt out" is listed alongside "opt-out"/"optout" so all three
// spellings are covered, mirroring the three "opt in" spellings in KEYWORDS.
const NEGATIVE: readonly string[] = [
  "list-unsubscribe",
  "unsubscribe",
  "desabonn",
  "desinscri",
  "abbestell",
  "opt-out",
  "opt out",
  "optout",
];

// Scoring weights. A keyword in the subject counts more than one in the body; a
// link can earn points from its URL and, independently, from its anchor text.
const SUBJECT_WEIGHT = 2;
const BODY_WEIGHT = 1;
const LINK_HREF_WEIGHT = 2;
const LINK_TEXT_WEIGHT = 2;

/** Minimum combined score for an email to be reported as a confirmation email. */
const THRESHOLD = 3;

/** Maximum number of candidate links returned to the caller. */
const MAX_LINKS = 3;

// Combining Diacritical Marks block, written with escapes so the pattern survives
// editors and tools that re-encode or re-normalize source text.
const COMBINING_MARKS = /[\u0300-\u036f]/g;

/** Lowercases `s` and strips diacritics (NFD decomposition, then mark removal). */
function normalize(s: string): string {
  return s.normalize("NFD").replace(COMBINING_MARKS, "").toLowerCase();
}

/** True when `href` (ignoring surrounding whitespace) starts with http:// or https://. */
function isHttp(href: string): boolean {
  return /^https?:\/\//i.test(href.trim());
}

/** True when `haystack` contains at least one of `needles` as a substring. */
function matchesAny(haystack: string, needles: readonly string[]): boolean {
  return needles.some((needle) => haystack.includes(needle));
}

/**
 * Scores a single link. Returns 0 when either the URL or the anchor text hits a
 * negative pattern; otherwise adds the URL-signal and anchor-keyword weights.
 */
function linkScore(href: string, text: string): number {
  const url = normalize(href);
  const anchor = normalize(text);
  if (matchesAny(url, NEGATIVE) || matchesAny(anchor, NEGATIVE)) return 0;
  let score = 0;
  if (matchesAny(url, LINK_SIGNALS)) score += LINK_HREF_WEIGHT;
  if (matchesAny(anchor, KEYWORDS)) score += LINK_TEXT_WEIGHT;
  return score;
}

/** Replaces every negative token in already-normalized `text` with a space. */
function stripNegatives(text: string): string {
  return NEGATIVE.reduce((out, token) => out.split(token).join(" "), text);
}

/**
 * Builds the ranked candidate list: http(s) links with a positive score, unique by
 * trimmed URL (the best score seen for a URL wins), strongest first. Ties keep the
 * order in which each URL first appeared in the input.
 */
function rankCandidates(links: DetectConfirmationInput["links"]): Candidate[] {
  // Map iteration follows first-insertion order, which gives the tie-break above.
  const bestByHref = new Map<string, number>();
  for (const link of links) {
    if (!isHttp(link.href)) continue;
    const score = linkScore(link.href, link.text);
    if (score <= 0) continue;
    const href = link.href.trim();
    const previous = bestByHref.get(href);
    if (previous === undefined || score > previous) bestByHref.set(href, score);
  }
  return [...bestByHref]
    .map(([href, score]) => ({ href, score }))
    .sort((a, b) => b.score - a.score);
}

/**
 * Decides whether an email is a subscription-confirmation email.
 *
 * @param input - Subject, body text and extracted links of the email.
 * @returns The combined score and up to three unique candidate URLs (strongest
 *   first), or null when there is no candidate link or the combined score is
 *   below the threshold.
 */
export function detectConfirmation(
  input: DetectConfirmationInput,
): ConfirmationResult | null {
  const candidates = rankCandidates(input.links);
  const best = candidates[0];
  // Without a usable link there is nothing to act on, whatever the text says.
  if (best === undefined) return null;

  const subject = stripNegatives(normalize(input.subject));
  const body = stripNegatives(normalize(input.text));

  const score =
    (matchesAny(subject, KEYWORDS) ? SUBJECT_WEIGHT : 0) +
    (matchesAny(body, KEYWORDS) ? BODY_WEIGHT : 0) +
    best.score;
  if (score < THRESHOLD) return null;

  return { score, links: candidates.slice(0, MAX_LINKS).map((c) => c.href) };
}