mirror of
https://github.com/juherr/kill-the-news.git
synced 2026-06-20 22:03:48 +00:00
4e3d378850
A "manage subscription" / "subscribe" footer link is now a weak (+1) URL signal instead of strong (+2), so an ordinary newsletter with a stray body keyword (active/valid) no longer crosses the detection threshold. A genuine "confirm your subscription" subject + a bare /subscribe link still passes. Also dedupe surfaced links. Adds false-positive + recall + dedupe tests.
123 lines
3.7 KiB
TypeScript
123 lines
3.7 KiB
TypeScript
/**
 * Pure detection of "confirm your subscription" emails. No DOM, no I/O — it
 * receives already-extracted subject/body text and link tuples (infra parses the
 * HTML). This module owns the business knowledge: the multilingual keyword vocab,
 * the link-signal patterns, the scoring weights and the threshold.
 *
 * Returns the ranked candidate confirmation links (top 3) when the combined score
 * clears the threshold AND at least one candidate link exists; otherwise null.
 * Only http(s) links are ever considered or returned.
 */
|
|
|
|
/**
 * Pre-extracted pieces of one email, as consumed by `detectConfirmation`.
 * Everything is plain text; no HTML parsing happens in this module.
 */
export interface DetectConfirmationInput {
  /** Subject line; a confirmation keyword found here is worth +2. */
  subject: string;
  /** Body text; a confirmation keyword found here is worth +1. */
  text: string;
  /** Links found in the email: the target URL plus the visible anchor text. */
  links: Array<{ href: string; text: string }>;
}
|
|
|
|
// Stems that indicate a confirmation request. They are stored pre-normalized
// (lowercase, no diacritics) because they are substring-matched against
// normalized subject, body and link text. Covers EN / FR / DE / ES; add a new
// language by appending its stems.
const KEYWORDS = [
  // Shared Latin-root stems (confirm / verify / activate / validate).
  "confirm", "verif", "activ", "valid",
  // German stems, diacritics already stripped.
  "bestatig", "aktivier",
  // Opt-in spelling variants.
  "opt-in", "opt in", "optin",
];
|
|
|
|
// URL fragments that unambiguously point at a confirm/verify/activate action or
// carry a token. A link whose normalized URL contains any of them earns +2.
const STRONG_LINK_SIGNALS = [
  // Action stems.
  "confirm", "verif", "activ", "valid", "bestatig", "aktivier",
  // Opt-in path variants.
  "optin", "opt-in", "double-optin",
  // Query-parameter and path markers.
  "token=", "confirm=", "activation",
];
|
|
|
|
// Ambiguous URL fragments: subscribe/subscription wording also shows up in
// everyday "manage subscription" footers, so a match is worth only +1. Combined
// with nothing but a stray body keyword that stays below the threshold, while a
// real "confirm your subscription" subject plus a bare /subscribe link still
// reaches it.
const WEAK_LINK_SIGNALS = [
  "subscription",
  "subscribe",
];
|
|
|
|
// Disqualifying fragments. A link whose URL or text contains one is never a
// candidate, and each fragment is blanked out of subject/body text before the
// keyword scan — otherwise "unsubscribe" would match through "subscribe".
// Order is significant: the fragments are removed from text in array order.
const NEGATIVE = [
  // English.
  "unsubscribe",
  // French stems.
  "desabonn", "desinscri",
  // German stem.
  "abbestell",
  // Opt-out variants and the list-unsubscribe marker.
  "opt-out", "optout", "list-unsubscribe",
];
|
|
|
|
// Minimum combined score for an email to count as a confirmation request. The
// score is the subject keyword bonus (+2) plus the body keyword bonus (+1) plus
// the score of the best candidate link; anything below this yields null.
const THRESHOLD = 3;
|
|
|
|
/**
 * Canonicalizes text for keyword matching: decomposes accented characters
 * (NFD), drops the combining marks and lowercases the rest, so "Bestätigen"
 * becomes "bestatigen".
 *
 * @param s Raw subject, body, link text or URL.
 * @returns The lowercased string with combining diacritical marks removed.
 */
function normalize(s: string): string {
  // U+0300–U+036F is the Combining Diacritical Marks block. The range is spelled
  // with escapes rather than raw combining characters, which are invisible in
  // most editors and easily corrupted by copy/paste or re-encoding.
  return s
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "")
    .toLowerCase();
}
|
|
|
|
/**
 * Tells whether a link target uses the http or https scheme.
 * Surrounding whitespace is ignored and the scheme is matched
 * case-insensitively.
 */
function isHttp(href: string): boolean {
  const target = href.trim();
  const schemePrefix = /^https?:\/\//i;
  return schemePrefix.test(target);
}
|
|
|
|
/**
 * Reports whether `haystack` contains at least one of `needles` as a
 * substring. An empty needle list never matches.
 */
function matchesAny(haystack: string, needles: string[]): boolean {
  for (const needle of needles) {
    if (haystack.includes(needle)) {
      return true;
    }
  }
  return false;
}
|
|
|
|
/**
 * Scores one link as a confirmation candidate.
 *
 * - 0 when the normalized URL or anchor text contains a negative fragment.
 * - Otherwise +2 for a strong URL signal, or +1 for a weak one (never both).
 * - Plus +2 when the anchor text contains a confirmation keyword.
 *
 * @param href Link target URL.
 * @param text Visible anchor text.
 * @returns A score between 0 and 4; 0 means "not a candidate".
 */
function linkScore(href: string, text: string): number {
  const url = normalize(href);
  const label = normalize(text);

  // A negative fragment in either part disqualifies the link outright.
  const disqualified = [url, label].some((part) => matchesAny(part, NEGATIVE));
  if (disqualified) return 0;

  // The strong signal takes precedence; the weak one only counts without it.
  let urlPoints = 0;
  if (matchesAny(url, STRONG_LINK_SIGNALS)) {
    urlPoints = 2;
  } else if (matchesAny(url, WEAK_LINK_SIGNALS)) {
    urlPoints = 1;
  }

  const labelPoints = matchesAny(label, KEYWORDS) ? 2 : 0;
  return urlPoints + labelPoints;
}
|
|
|
|
/**
 * Replaces every occurrence of each negative fragment with a single space so
 * that a later keyword scan cannot match inside words such as "unsubscribe".
 * Fragments are applied in the order they appear in `NEGATIVE`.
 */
function stripNegatives(text: string): string {
  return NEGATIVE.reduce(
    (remaining, fragment) => remaining.split(fragment).join(" "),
    text,
  );
}
|
|
|
|
/**
 * Decides whether an email is a subscription-confirmation request and, if so,
 * returns the links most likely to be the confirmation link.
 *
 * Score = subject keyword (+2) + body keyword (+1) + best candidate link score.
 * Negative fragments are removed from subject and body before the keyword scan.
 *
 * @param input Pre-extracted subject, body text and links of the email.
 * @returns Up to three distinct http(s) hrefs, best-scoring first, or `null`
 *   when there is no candidate link or the score is below the threshold.
 */
export function detectConfirmation(
  input: DetectConfirmationInput,
): string[] | null {
  // Keep only http(s) links that score above zero, hrefs trimmed.
  const scored: { href: string; score: number }[] = [];
  for (const link of input.links) {
    if (!isHttp(link.href)) continue;
    const score = linkScore(link.href, link.text);
    if (score > 0) scored.push({ href: link.href.trim(), score });
  }
  scored.sort((left, right) => right.score - left.score);

  // Without any candidate link there is nothing to surface.
  const [best] = scored;
  if (best === undefined) return null;

  const hasKeyword = (raw: string): boolean =>
    matchesAny(stripNegatives(normalize(raw)), KEYWORDS);
  const subjectPoints = hasKeyword(input.subject) ? 2 : 0;
  const bodyPoints = hasKeyword(input.text) ? 1 : 0;

  const total = subjectPoints + bodyPoints + best.score;
  if (total < THRESHOLD) return null;

  // Collapse repeated hrefs first so duplicates cannot consume one of the
  // three returned slots; a Set keeps first-seen (ranked) order.
  const distinct = new Set<string>();
  for (const { href } of scored) distinct.add(href);
  return Array.from(distinct).slice(0, 3);
}
|