fix(domain): cut confirmation false positives via weak subscribe signal

A "manage subscription" / "subscribe" footer link is now a weak (+1) URL
signal instead of strong (+2), so an ordinary newsletter with a stray body
keyword (active/valid) no longer crosses the detection threshold. A genuine
"confirm your subscription" subject + a bare /subscribe link still passes.
Also dedupes surfaced links, and adds false-positive, recall, and dedupe tests.
This commit is contained in:
Julien Herr
2026-05-25 10:58:20 +02:00
parent 421430632e
commit 4e3d378850
2 changed files with 89 additions and 6 deletions
+14 -6
View File
@@ -29,8 +29,9 @@ const KEYWORDS = [
"optin",
];
// Link URL/anchor signals (normalized). A link matching any → candidate.
const LINK_SIGNALS = [
// Strong URL signals: an unambiguous confirm/verify/activate action or a token.
// A link URL matching any scores +2.
const STRONG_LINK_SIGNALS = [
"confirm",
"verif",
"activ",
@@ -40,13 +41,17 @@ const LINK_SIGNALS = [
"optin",
"opt-in",
"double-optin",
"subscription",
"subscribe",
"token=",
"confirm=",
"activation",
];
// Weak URL signals. "subscribe" / "subscription" are ambiguous: the same words
// show up in the "manage subscription" footer of ordinary newsletters. A URL
// that matches only these scores +1, so a footer link plus a stray body keyword
// stays below the threshold, while a genuine "confirm your subscription"
// subject combined with a bare /subscribe link still passes.
const WEAK_LINK_SIGNALS = [
  "subscription",
  "subscribe",
];
// Negative patterns: a link matching any of these is NEVER a candidate, and these
// tokens are stripped from text before keyword scanning (kills the unsubscribe
// false positive — "unsubscribe" contains "subscribe").
@@ -79,7 +84,8 @@ function linkScore(href: string, text: string): number {
const t = normalize(text);
if (matchesAny(h, NEGATIVE) || matchesAny(t, NEGATIVE)) return 0;
let score = 0;
if (matchesAny(h, LINK_SIGNALS)) score += 2;
if (matchesAny(h, STRONG_LINK_SIGNALS)) score += 2;
else if (matchesAny(h, WEAK_LINK_SIGNALS)) score += 1;
if (matchesAny(t, KEYWORDS)) score += 2;
return score;
}
@@ -110,5 +116,7 @@ export function detectConfirmation(
if (subjectScore + bodyScore + bestLinkScore < THRESHOLD) return null;
return candidates.slice(0, 3).map((c) => c.href);
// Dedupe by href before capping, so a link repeated in the body never wastes
// one of the three surfaced slots.
return [...new Set(candidates.map((c) => c.href))].slice(0, 3);
}