/**
 * Pure detection of "confirm your subscription" emails. No DOM, no I/O — it
 * receives already-extracted subject/body text and link tuples (infra parses the
 * HTML). This module owns the business knowledge: the multilingual keyword vocab,
 * the link-signal patterns, the scoring weights and the threshold.
 *
 * Returns the ranked candidate confirmation links (top 3) when the combined score
 * clears the threshold AND at least one candidate link exists; otherwise null.
 * Only http(s) links are ever considered or returned.
 */

/** Already-extracted email content that the detector scores; no HTML parsing happens here. */
export interface DetectConfirmationInput {
  /** Subject line of the email. */
  subject: string;
  /** Body text of the email. */
  text: string;
  /** Link tuples extracted from the email: `href` is the link target, `text` the link's text. */
  links: { href: string; text: string }[];
}

// Stems that signal a confirmation request. They are stored in normalized form
// (lowercase, no diacritics) because they are matched against normalized text.
// Covers EN / FR / DE / ES; add a stem here to support another language.
const KEYWORDS = [
  // Stems shared by the English, French and Spanish wording
  "confirm", "verif", "activ", "valid",
  // German
  "bestatig", "aktivier",
  // Spellings of "opt-in"
  "opt-in", "opt in", "optin",
];

// Strong URL signals: an unambiguous confirm/verify/activate action or a token.
// A link URL matching any scores +2.
//
// URL paths are conventionally ASCII, so German sites transliterate the umlaut
// ("bestaetigen") instead of dropping it. Diacritic stripping never produces
// that spelling, hence the explicit "bestaetig" stem next to "bestatig".
//
// A few entries are subsumed by shorter stems ("activation" by "activ",
// "confirm=" by "confirm", "double-optin" by "optin"); they are listed anyway to
// document the URL shapes being targeted.
const STRONG_LINK_SIGNALS = [
  "confirm",
  "verif",
  "activ",
  "valid",
  "bestatig",
  "bestaetig",
  "aktivier",
  "optin",
  "opt-in",
  "double-optin",
  "token=",
  "confirm=",
  "activation",
];

// Weak URL signals: generic subscribe/subscription wording, which ordinary
// "manage subscription" footers use too. A URL matching one of these (and no
// strong signal) earns just +1: together with a stray body keyword that is still
// below the threshold, so no false alarm, while a real "confirm your
// subscription" subject plus a plain /subscribe link does reach it.
const WEAK_LINK_SIGNALS = [
  "subscription",
  "subscribe",
];

// Negative patterns (normalized form). A link whose URL or text contains any of
// these is never a candidate — this is what stops an "unsubscribe" URL, which
// contains the weak signal "subscribe", from scoring. The same tokens are also
// blanked out of the subject and body before keyword scanning. Stripping runs in
// list order, so keep the order stable.
const NEGATIVE = [
  // English
  "unsubscribe",
  // French
  "desabonn", "desinscri",
  // German
  "abbestell",
  // Opt-out spellings and the List-Unsubscribe form
  "opt-out", "optout", "list-unsubscribe",
];

// Minimum combined score (subject keyword +2, body keyword +1, plus the score of
// the best candidate link) for an email to be reported as a confirmation email.
const THRESHOLD = 3;

/**
 * Canonicalizes text for keyword matching: decomposes accented characters
 * (NFD), removes the combining diacritical marks U+0300–U+036F, and lowercases.
 * For example "Vérifier" becomes "verifier".
 *
 * The mark range is spelled with `\u` escapes instead of raw combining
 * characters: those are invisible in most editors and can be silently altered
 * by Unicode normalization or re-encoding of the source file.
 *
 * @param s Raw text (subject, body, link text or URL).
 * @returns The lowercase, diacritic-free form of `s`.
 */
function normalize(s: string): string {
  return s
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "")
    .toLowerCase();
}

/**
 * Tells whether a link target uses the http or https scheme. Surrounding
 * whitespace is ignored and the scheme is matched case-insensitively.
 */
function isHttp(href: string): boolean {
  const target = href.trim();
  const httpScheme = /^https?:\/\//i;
  return httpScheme.test(target);
}

/**
 * Reports whether `haystack` contains at least one of `needles` as a substring.
 * The comparison is exact (case-sensitive); callers normalize beforehand.
 */
function matchesAny(haystack: string, needles: string[]): boolean {
  for (const needle of needles) {
    if (haystack.includes(needle)) {
      return true;
    }
  }
  return false;
}

/**
 * Scores one link as a confirmation candidate.
 *
 * - Any negative pattern in the URL or the link text disqualifies it (score 0).
 * - The URL contributes +2 for a strong signal, otherwise +1 for a weak one.
 * - The link text contributes +2 when it contains a confirmation keyword.
 *
 * @param href Link target, as extracted.
 * @param text Link text, as extracted.
 * @returns A score from 0 to 4; 0 means "not a candidate".
 */
function linkScore(href: string, text: string): number {
  const url = normalize(href);
  const label = normalize(text);

  const disqualified = [url, label].some((part) => matchesAny(part, NEGATIVE));
  if (disqualified) return 0;

  // Strong and weak URL signals are mutually exclusive; strong wins.
  const urlPoints = matchesAny(url, STRONG_LINK_SIGNALS)
    ? 2
    : matchesAny(url, WEAK_LINK_SIGNALS)
      ? 1
      : 0;
  const labelPoints = matchesAny(label, KEYWORDS) ? 2 : 0;

  return urlPoints + labelPoints;
}

/**
 * Replaces every occurrence of each negative pattern in `text` with a single
 * space, applying the patterns one after another in list order. Expects text
 * that has already been normalized, since the patterns are in normalized form.
 */
function stripNegatives(text: string): string {
  return NEGATIVE.reduce(
    (remaining, token) => remaining.split(token).join(" "),
    text,
  );
}

/**
 * Decides whether an email asks the recipient to confirm a subscription and, if
 * so, returns the links most likely to perform that confirmation.
 *
 * The combined score is: +2 if the subject contains a confirmation keyword, +1
 * if the body does, plus the score of the best candidate link. Negative
 * patterns are stripped from subject and body before the keyword scan.
 *
 * @param input Already-extracted subject, body text and links of the email.
 * @returns Up to three distinct http(s) link targets (trimmed), best score
 *   first, or `null` when there is no candidate link or the combined score is
 *   below the threshold.
 */
export function detectConfirmation(
  input: DetectConfirmationInput,
): string[] | null {
  // Keep only http(s) links carrying at least one positive signal, best first.
  const candidates = input.links
    .filter((l) => isHttp(l.href))
    .map((l) => ({ href: l.href.trim(), score: linkScore(l.href, l.text) }))
    .filter((l) => l.score > 0)
    .sort((a, b) => b.score - a.score);

  // Destructure instead of indexing so the "no candidates" case is narrowed by
  // the type checker (indexing is unsafe under `noUncheckedIndexedAccess`).
  const [best] = candidates;
  if (best === undefined) return null;

  const subject = stripNegatives(normalize(input.subject));
  const text = stripNegatives(normalize(input.text));

  const subjectScore = matchesAny(subject, KEYWORDS) ? 2 : 0;
  const bodyScore = matchesAny(text, KEYWORDS) ? 1 : 0;

  if (subjectScore + bodyScore + best.score < THRESHOLD) return null;

  // Dedupe by href before capping, so a link repeated in the body never wastes
  // one of the three surfaced slots. The Set keeps the first (highest-scored)
  // occurrence of each href.
  return [...new Set(candidates.map((c) => c.href))].slice(0, 3);
}