// Behavioural spec for extractFeedLinks: collecting <link rel="alternate">
// feed declarations from email HTML.
//
// NOTE(review): the <link> markup inside the fixtures below was rebuilt from
// the asserted results and the test titles; the original attribute order and
// any surrounding elements are assumptions — confirm against the real commit.
describe("extractFeedLinks", () => {
  it("extracts rel=alternate links that carry a type", () => {
    const html = `
      <link rel="alternate" type="application/rss+xml" href="https://blog.example.com/feed.xml">
      <link rel="alternate" type="application/atom+xml" href="https://blog.example.com/atom.xml">
      <p>hi</p>`;
    expect(extractFeedLinks(html)).toEqual([
      {
        href: "https://blog.example.com/feed.xml",
        type: "application/rss+xml",
      },
      {
        href: "https://blog.example.com/atom.xml",
        type: "application/atom+xml",
      },
    ]);
  });

  it("ignores non-alternate rels and links without a type", () => {
    // First link has the wrong rel, second has no type attribute: both must
    // be skipped.
    const html = `
      <link rel="stylesheet" type="text/css" href="https://blog.example.com/style.css">
      <link rel="alternate" href="https://blog.example.com/feed.xml">
    `;
    expect(extractFeedLinks(html)).toEqual([]);
  });

  it("absolutizes a relative href against the base", () => {
    const html = `<link rel="alternate" type="application/rss+xml" href="/feed.xml">`;
    expect(extractFeedLinks(html, "https://blog.example.com")).toEqual([
      {
        href: "https://blog.example.com/feed.xml",
        type: "application/rss+xml",
      },
    ]);
  });

  it("drops a relative href when no base is given", () => {
    const html = `<link rel="alternate" type="application/rss+xml" href="/feed.xml">`;
    expect(extractFeedLinks(html)).toEqual([]);
  });

  it("returns [] for plain-text bodies", () => {
    expect(extractFeedLinks("just text https://x.com/feed")).toEqual([]);
  });
});
// Gather the feed declarations a newsletter advertises about itself, i.e.
// every <link rel="alternate" type="…" href="…"> found in the email HTML.
//
// The result is the raw { href, type } pairs in document order; which MIME
// types actually count as feeds is left to the domain layer.
//
// - content: the email body. Empty or plain-text bodies carry no <link>
//   elements, so they yield [].
// - base: sender base URL used (best-effort, via toAbsolute) to resolve
//   hrefs that are not already http(s). Defaults to "".
//
// Entries with a blank type or href, or whose resolved href is not an
// http(s) URL, are dropped.
export function extractFeedLinks(
  content: string,
  base = "",
): { href: string; type: string }[] {
  if (!content || isPlainText(content)) return [];

  // Same scheme check is applied both before and after resolution.
  const isHttpUrl = (candidate: string): boolean =>
    /^https?:\/\//i.test(candidate);

  const { document } = parseHTML(content);
  const candidates = document.querySelectorAll(
    'link[rel~="alternate"][type]',
  );
  const feeds: { href: string; type: string }[] = [];

  candidates.forEach((node: Element) => {
    const type = (node.getAttribute("type") ?? "").trim();
    const declared = (node.getAttribute("href") ?? "").trim();
    if (type === "" || declared === "") return;

    // Absolute http(s) hrefs are kept verbatim; anything else gets one
    // attempt at resolution against the base.
    let href = declared;
    if (!isHttpUrl(declared)) {
      href = toAbsolute(declared, base) ?? "";
    }

    if (isHttpUrl(href)) {
      feeds.push({ href, type });
    }
  });

  return feeds;
}