mirror of
https://github.com/juherr/kill-the-news.git
synced 2026-06-20 22:03:48 +00:00
feat(infra): extract rel=alternate feed links from email HTML
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4,6 +4,7 @@ import {
|
|||||||
extractInlineCids,
|
extractInlineCids,
|
||||||
htmlToText,
|
htmlToText,
|
||||||
extractLinks,
|
extractLinks,
|
||||||
|
extractFeedLinks,
|
||||||
} from "./html-processor";
|
} from "./html-processor";
|
||||||
import type { AttachmentData } from "../types";
|
import type { AttachmentData } from "../types";
|
||||||
|
|
||||||
@@ -348,3 +349,49 @@ describe("extractLinks", () => {
|
|||||||
expect(extractLinks("")).toEqual([]);
|
expect(extractLinks("")).toEqual([]);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Behavioural spec for extractFeedLinks: which <link> elements are reported,
// and how relative hrefs are treated with and without a base URL.
describe("extractFeedLinks", () => {
  const RSS_TYPE = "application/rss+xml";
  const ATOM_TYPE = "application/atom+xml";

  // A lone root-relative feed declaration, shared by both base-handling cases.
  const relativeFeedHtml = `<head><link rel="alternate" type="application/rss+xml" href="/feed.xml"></head>`;

  it("extracts rel=alternate links that carry a type", () => {
    const html = [
      "<html><head>",
      `<link rel="alternate" type="application/rss+xml" href="https://blog.example.com/feed.xml">`,
      `<link rel="alternate" type="application/atom+xml" href="https://blog.example.com/atom.xml">`,
      "</head><body>hi</body></html>",
    ].join("\n");

    const expected = [
      { href: "https://blog.example.com/feed.xml", type: RSS_TYPE },
      { href: "https://blog.example.com/atom.xml", type: ATOM_TYPE },
    ];

    // Both declarations come back, in document order.
    expect(extractFeedLinks(html)).toEqual(expected);
  });

  it("ignores non-alternate rels and links without a type", () => {
    const html = [
      "<head>",
      `<link rel="stylesheet" type="text/css" href="https://x.com/a.css">`,
      `<link rel="alternate" href="https://x.com/notype">`,
      "</head>",
    ].join("\n");

    expect(extractFeedLinks(html)).toEqual([]);
  });

  it("absolutizes a relative href against the base", () => {
    const feeds = extractFeedLinks(relativeFeedHtml, "https://blog.example.com");

    expect(feeds).toEqual([
      { href: "https://blog.example.com/feed.xml", type: RSS_TYPE },
    ]);
  });

  it("drops a relative href when no base is given", () => {
    const feeds = extractFeedLinks(relativeFeedHtml);

    expect(feeds).toEqual([]);
  });

  it("returns [] for plain-text bodies", () => {
    const plainBody = "just text https://x.com/feed";

    expect(extractFeedLinks(plainBody)).toEqual([]);
  });
});
|
||||||
|
|||||||
@@ -69,6 +69,34 @@ export function extractLinks(
|
|||||||
return links;
|
return links;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Gathers the feed declarations a newsletter advertises about itself, i.e.
 * `<link rel="alternate" type="…" href="…">` elements (typically
 * application/atom+xml, application/rss+xml or application/feed+json).
 *
 * The raw href/type pairs are returned as found; deciding which MIME types
 * actually denote a feed is left to the domain layer.
 *
 * @param content - Email body. Empty or plain-text bodies yield `[]`, since
 *   they cannot contain a `<link>` element.
 * @param base - Sender base used to absolutize non-http(s) hrefs on a
 *   best-effort basis. Defaults to `""`.
 * @returns One `{ href, type }` entry per qualifying link, in document order.
 *   Only http(s) URLs survive; anything else is dropped.
 */
export function extractFeedLinks(
  content: string,
  base = "",
): { href: string; type: string }[] {
  if (!content) return [];
  if (isPlainText(content)) return [];

  const isHttpUrl = (candidate: string): boolean =>
    /^https?:\/\//i.test(candidate);

  const found: { href: string; type: string }[] = [];
  const { document } = parseHTML(content);
  const candidates = document.querySelectorAll(
    'link[rel~="alternate"][type]',
  );

  candidates.forEach((el: Element) => {
    const type = el.getAttribute("type")?.trim() ?? "";
    const rawHref = el.getAttribute("href")?.trim() ?? "";
    // Both attributes must carry a non-blank value to be useful.
    if (type === "" || rawHref === "") return;

    // Already-absolute http(s) hrefs are kept verbatim; everything else is
    // resolved against the base, falling back to "" when that fails.
    let href = rawHref;
    if (!isHttpUrl(rawHref)) {
      href = toAbsolute(rawHref, base) ?? "";
    }

    // Unresolvable or non-http(s) results are discarded.
    if (isHttpUrl(href)) {
      found.push({ href, type });
    }
  });

  return found;
}
|
||||||
|
|
||||||
// Newsletters frequently defer images via data-src/loading="lazy"; readers don't
|
// Newsletters frequently defer images via data-src/loading="lazy"; readers don't
|
||||||
// run the lazy-loader, so the image renders blank. Promote the real source.
|
// run the lazy-loader, so the image renders blank. Promote the real source.
|
||||||
function promoteLazyImages(document: ParsedDocument): void {
|
function promoteLazyImages(document: ParsedDocument): void {
|
||||||
|
|||||||
Reference in New Issue
Block a user