mirror of
https://github.com/juherr/kill-the-news.git
synced 2026-06-20 22:03:48 +00:00
feat(infra): extractLinks for confirmation detection
This commit is contained in:
@@ -3,6 +3,7 @@ import {
|
|||||||
processEmailContent,
|
processEmailContent,
|
||||||
extractInlineCids,
|
extractInlineCids,
|
||||||
htmlToText,
|
htmlToText,
|
||||||
|
extractLinks,
|
||||||
} from "./html-processor";
|
} from "./html-processor";
|
||||||
import type { AttachmentData } from "../types";
|
import type { AttachmentData } from "../types";
|
||||||
|
|
||||||
@@ -319,3 +320,31 @@ describe("extractInlineCids", () => {
|
|||||||
expect(extractInlineCids("").size).toBe(0);
|
expect(extractInlineCids("").size).toBe(0);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// extractLinks: anchor tuples from HTML bodies, bare URLs from plain-text
// bodies, and nothing at all from empty input.
describe("extractLinks", () => {
  it("collects anchor href + text from HTML", () => {
    const html =
      '<p>hi <a href="https://x.example/confirm?t=1">Confirm</a> and <a href="https://x.example/home">Home</a></p>';
    const expected = [
      { href: "https://x.example/confirm?t=1", text: "Confirm" },
      { href: "https://x.example/home", text: "Home" },
    ];

    // Both anchors are reported, in the order they appear in the markup.
    expect(extractLinks(html)).toEqual(expected);
  });

  it("falls back to regex URL extraction for plain text", () => {
    const body = "Confirm here: https://x.example/verify/abc thanks";
    const url = "https://x.example/verify/abc";

    // Plain text has no visible label, so the URL doubles as the link text.
    expect(extractLinks(body)).toEqual([{ href: url, text: url }]);
  });

  it("returns an empty array for empty content", () => {
    const found = extractLinks("");

    expect(found).toEqual([]);
  });
});
|
||||||
|
|||||||
@@ -41,6 +41,34 @@ export function htmlToText(value: string): string {
|
|||||||
.trim();
|
.trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Collects the links from an email body for confirmation detection.
 *
 * Infra owns the DOM parse; the domain detector receives plain
 * `{ href, text }` tuples.
 *
 * - Empty content yields an empty array.
 * - When `isPlainText(content)` holds, the body is swept with a URL regex and
 *   each match is returned as both `href` and `text`. Sentence punctuation
 *   that directly follows a URL (`"... https://x.example/verify/abc."`) is
 *   not part of the match.
 * - Otherwise the content is parsed as HTML and every `a[href]` element with
 *   a non-blank `href` contributes its trimmed href plus its text content
 *   with whitespace runs collapsed to single spaces.
 *
 * @param content - Raw email body, either HTML or plain text.
 * @returns One tuple per link found; duplicates are not removed.
 */
export function extractLinks(
  content: string,
): { href: string; text: string }[] {
  if (!content) return [];

  if (isPlainText(content)) {
    // The trailing character class forces every match to end on something
    // other than sentence punctuation, so "see https://x.example/a." yields
    // ".../a" and a bare "https://." yields no match at all.
    const urls =
      content.match(/https?:\/\/[^\s<>"')]*[^\s<>"').,;:!?]/gi) ?? [];
    return urls.map((url) => ({ href: url, text: url }));
  }

  const { document } = parseHTML(content);
  const links: { href: string; text: string }[] = [];
  document.querySelectorAll("a[href]").forEach((el: Element) => {
    const href = (el.getAttribute("href") ?? "").trim();
    // `a[href]` also matches anchors whose href is empty or whitespace-only;
    // those carry no target and are skipped.
    if (!href) return;
    links.push({
      href,
      // NOTE(review): the cast implies the `Element` type in scope does not
      // declare `textContent` — confirm before simplifying it away.
      text: ((el as unknown as { textContent?: string }).textContent ?? "")
        .replace(/\s+/g, " ")
        .trim(),
    });
  });
  return links;
}
|
||||||
|
|
||||||
// Newsletters frequently defer images via data-src/loading="lazy"; readers don't
|
// Newsletters frequently defer images via data-src/loading="lazy"; readers don't
|
||||||
// run the lazy-loader, so the image renders blank. Promote the real source.
|
// run the lazy-loader, so the image renders blank. Promote the real source.
|
||||||
function promoteLazyImages(document: ParsedDocument): void {
|
function promoteLazyImages(document: ParsedDocument): void {
|
||||||
|
|||||||
Reference in New Issue
Block a user