From 3f354356101769129a4614a40e8ad2e49a576c5c Mon Sep 17 00:00:00 2001 From: Julien Herr Date: Mon, 25 May 2026 23:35:10 +0200 Subject: [PATCH] fix(confirmation): recognize localized subscribe CTAs in weak link signals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The weak link-signal vocabulary was English-only, so a genuine double opt-in whose confirm button reads "Je m'inscris…" over an opaque tracking redirect scored 0 on every link and was missed. Make the weak vocab multilingual (FR/DE/ES) to match the confirmation keywords. Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 5 +++++ src/domain/confirmation.test.ts | 21 +++++++++++++++++++++ src/domain/confirmation.ts | 17 ++++++++++++----- 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fa3d58c..3c8a591 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,11 @@ verbatim as the GitHub Release notes — so what you write here is what ships. ### Fixed +- Subscription-confirmation detection now recognizes localized "subscribe" CTAs. + The weak link-signal vocabulary was English-only (`subscribe` / `subscription`), + so a genuine double opt-in whose confirm button reads "Je m'inscris…" over an + opaque tracking redirect scored 0 on every link and was missed. The weak vocab + is now multilingual (FR/DE/ES) to match the confirmation keywords. - Per-feed favicons no longer fail for senders whose DuckDuckGo icon is a hi-res PNG: the maximum accepted favicon size is raised from 100 KB to 256 KB, so legitimate large icons (~107 KB and up) are cached instead of rejected. 
diff --git a/src/domain/confirmation.test.ts b/src/domain/confirmation.test.ts index c15ea7e..eb700b3 100644 --- a/src/domain/confirmation.test.ts +++ b/src/domain/confirmation.test.ts @@ -178,6 +178,27 @@ describe("detectConfirmation", () => { expect(result![0]).toContain("click.example.com"); }); + it("detects a French confirm email whose CTA text is a localized 'subscribe' over an opaque tracking href", () => { + // Real-world double opt-in: subject/body clearly confirm, but the + // button's href is an opaque provider redirect (proc.php?…&act=csub — no + // signal) and its visible text "Je m'inscris…" is the French equivalent of + // "subscribe" (a weak signal). The weak vocab must be multilingual like the + // confirmation keywords, otherwise the link scores 0 and the email is missed. + const result = detectConfirmation({ + subject: "[Action requise] Confirme ton inscription", + text: "Avant de confirmer ton inscription, clique ici.", + links: [ + { + href: "https://email.example.com/proc.php?nl=1&f=36&s=abc&act=csub", + text: "Je m'inscris sur la liste d'attente", + }, + { href: "https://www.example.com/", text: "Notre site" }, + ], + }); + expect(result).not.toBeNull(); + expect(result![0]).toContain("proc.php"); + }); + it("dedupes a confirmation link repeated in the body", () => { const result = detectConfirmation({ subject: "Confirm your subscription", diff --git a/src/domain/confirmation.ts b/src/domain/confirmation.ts index fee84ba..e2e77ee 100644 --- a/src/domain/confirmation.ts +++ b/src/domain/confirmation.ts @@ -48,11 +48,18 @@ const STRONG_LINK_SIGNALS = [ // Weak signals: ambiguous subscribe/subscription words that also appear in // ordinary "manage subscription" footers. Matched on the link href OR its visible -// text (a CTA button often reads "Yes, subscribe me…" over an opaque tracking -// redirect). 
Worth only +1 — and only once, never href+text additively — so they -// cannot, on their own (with a stray body keyword), cross the threshold and cry -// wolf, yet still let a genuine "confirm your subscription" email pass. -const WEAK_LINK_SIGNALS = ["subscription", "subscribe"]; +// text (a CTA button often reads "Yes, subscribe me…" / "Je m'inscris…" over an +// opaque tracking redirect). Worth only +1 — and only once, never href+text +// additively — so they cannot, on their own (with a stray body keyword), cross +// the threshold and cry wolf, yet still let a genuine "confirm your subscription" +// email pass. Multilingual like KEYWORDS (EN / FR / DE / ES) — extend per language. +const WEAK_LINK_SIGNALS = [ + "subscri", // EN: subscribe / subscription — stem stops before b/p (unsubscribe is caught by NEGATIVE first) + "inscri", // FR: s'inscrire / inscription / je m'inscris + "anmeld", // DE: anmelden / anmeldung + "suscri", // ES: suscribir / suscripción — stem stops before b/p + "inscrib", // ES: inscribirse (inscripción is already matched by "inscri") +]; // Negative patterns: a link matching any of these is NEVER a candidate, and these // tokens are stripped from text before keyword scanning (kills the unsubscribe