From a29e9ab3723801c3a087e63069aeb2807aa6cdfc Mon Sep 17 00:00:00 2001 From: Julien Herr Date: Fri, 22 May 2026 21:12:10 +0200 Subject: [PATCH] feat: WebSub Atom support, HTML processing via linkedom, W3C badges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WebSub / PubSubHubbub: - Hub now accepts both /rss/:id and /atom/:id topic URLs - WebSubSubscription stores format ("rss" | "atom") - notifySubscribers sends RSS or Atom XML with correct Content-Type - verifyAndStoreSubscription sends correct topic URL per format - CI paths-ignore docs/** to skip deploy on docs-only changes HTML processing (linkedom + escape-html): - New html-processor.ts: body extraction, script/iframe/object removal, event handler + javascript: URL stripping, mso-* style cleanup, plain text →
 <pre> with HTML escaping via escape-html
- feed-generator.ts and entries.ts use processEmailContent

Admin UI:
- W3C validation badges (Atom + RSS) on feed detail page

Co-Authored-By: Claude Sonnet 4.5 
---
 .github/workflows/ci.yml         |   4 +
 package-lock.json                | 191 ++++++++++++++++++++++++++++++-
 package.json                     |   3 +
 src/routes/admin/emails.tsx      |  24 ++++
 src/routes/entries.ts            |   5 +-
 src/routes/hub.test.ts           |  84 +++++++++++++-
 src/routes/hub.ts                |  10 +-
 src/styles/components.css        |  11 ++
 src/utils/feed-generator.ts      |  28 +----
 src/utils/html-processor.test.ts | 125 ++++++++++++++++++++
 src/utils/html-processor.ts      |  73 ++++++++++++
 src/utils/websub.test.ts         | 120 +++++++++++++++++++
 src/utils/websub.ts              | 110 ++++++++++++------
 13 files changed, 719 insertions(+), 69 deletions(-)
 create mode 100644 src/utils/html-processor.test.ts
 create mode 100644 src/utils/html-processor.ts

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 244ea0a..2125f4d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,8 +3,12 @@ name: CI
 on:
   push:
     branches: ["main"]
+    paths-ignore:
+      - "docs/**"
   pull_request:
     branches: ["main"]
+    paths-ignore:
+      - "docs/**"
 
 jobs:
   ci:
diff --git a/package-lock.json b/package-lock.json
index a703046..f2d2a27 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -10,13 +10,16 @@
       "license": "MIT",
       "dependencies": {
         "@hono/zod-validator": "^0.8.0",
+        "escape-html": "^1.0.3",
         "feed": "5.2.1",
         "hono": "4.12.22",
+        "linkedom": "^0.18.12",
         "postal-mime": "^2.7.4",
         "zod": "4.4.3"
       },
       "devDependencies": {
         "@cloudflare/workers-types": "4.20260522.1",
+        "@types/escape-html": "^1.0.4",
         "@types/mailparser": "^3.4.6",
         "@types/rss": "^0.0.32",
         "@vitest/coverage-v8": "4.1.7",
@@ -2007,6 +2010,13 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/@types/escape-html": {
+      "version": "1.0.4",
+      "resolved": "https://registry.npmjs.org/@types/escape-html/-/escape-html-1.0.4.tgz",
+      "integrity": "sha512-qZ72SFTgUAZ5a7Tj6kf2SHLetiH5S6f8G5frB2SPQ3EyF02kxdyBFf4Tz4banE3xCgGnKgWLt//a6VuYHKYJTg==",
+      "dev": true,
+      "license": "MIT"
+    },
     "node_modules/@types/esrecurse": {
       "version": "4.3.1",
       "resolved": "https://registry.npmjs.org/@types/esrecurse/-/esrecurse-4.3.1.tgz",
@@ -2566,6 +2576,12 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/boolbase": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
+      "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==",
+      "license": "ISC"
+    },
     "node_modules/brace-expansion": {
       "version": "5.0.6",
       "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.6.tgz",
@@ -2749,6 +2765,40 @@
         "node": ">= 8"
       }
     },
+    "node_modules/css-select": {
+      "version": "5.2.2",
+      "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.2.2.tgz",
+      "integrity": "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==",
+      "license": "BSD-2-Clause",
+      "dependencies": {
+        "boolbase": "^1.0.0",
+        "css-what": "^6.1.0",
+        "domhandler": "^5.0.2",
+        "domutils": "^3.0.1",
+        "nth-check": "^2.0.1"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/fb55"
+      }
+    },
+    "node_modules/css-what": {
+      "version": "6.2.2",
+      "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.2.2.tgz",
+      "integrity": "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==",
+      "license": "BSD-2-Clause",
+      "engines": {
+        "node": ">= 6"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/fb55"
+      }
+    },
+    "node_modules/cssom": {
+      "version": "0.5.0",
+      "resolved": "https://registry.npmjs.org/cssom/-/cssom-0.5.0.tgz",
+      "integrity": "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==",
+      "license": "MIT"
+    },
     "node_modules/debug": {
       "version": "4.4.3",
       "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
@@ -2784,6 +2834,73 @@
         "node": ">=8"
       }
     },
+    "node_modules/dom-serializer": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
+      "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
+      "license": "MIT",
+      "dependencies": {
+        "domelementtype": "^2.3.0",
+        "domhandler": "^5.0.2",
+        "entities": "^4.2.0"
+      },
+      "funding": {
+        "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
+      }
+    },
+    "node_modules/dom-serializer/node_modules/entities": {
+      "version": "4.5.0",
+      "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
+      "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
+      "license": "BSD-2-Clause",
+      "engines": {
+        "node": ">=0.12"
+      },
+      "funding": {
+        "url": "https://github.com/fb55/entities?sponsor=1"
+      }
+    },
+    "node_modules/domelementtype": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
+      "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/fb55"
+        }
+      ],
+      "license": "BSD-2-Clause"
+    },
+    "node_modules/domhandler": {
+      "version": "5.0.3",
+      "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
+      "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
+      "license": "BSD-2-Clause",
+      "dependencies": {
+        "domelementtype": "^2.3.0"
+      },
+      "engines": {
+        "node": ">= 4"
+      },
+      "funding": {
+        "url": "https://github.com/fb55/domhandler?sponsor=1"
+      }
+    },
+    "node_modules/domutils": {
+      "version": "3.2.2",
+      "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz",
+      "integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==",
+      "license": "BSD-2-Clause",
+      "dependencies": {
+        "dom-serializer": "^2.0.0",
+        "domelementtype": "^2.3.0",
+        "domhandler": "^5.0.3"
+      },
+      "funding": {
+        "url": "https://github.com/fb55/domutils?sponsor=1"
+      }
+    },
     "node_modules/emoji-regex": {
       "version": "8.0.0",
       "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
@@ -2795,7 +2912,6 @@
       "version": "7.0.1",
       "resolved": "https://registry.npmjs.org/entities/-/entities-7.0.1.tgz",
       "integrity": "sha512-TWrgLOFUQTH994YUyl1yT4uyavY5nNB5muff+RtWaqNVCAK408b5ZnnbNAUEWLTCpum9w6arT70i1XdQ4UeOPA==",
-      "dev": true,
       "license": "BSD-2-Clause",
       "engines": {
         "node": ">=0.12"
@@ -2886,6 +3002,12 @@
         "node": ">=6"
       }
     },
+    "node_modules/escape-html": {
+      "version": "1.0.3",
+      "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz",
+      "integrity": "sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow==",
+      "license": "MIT"
+    },
     "node_modules/escape-string-regexp": {
       "version": "4.0.0",
       "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz",
@@ -3340,6 +3462,25 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/htmlparser2": {
+      "version": "10.1.0",
+      "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.1.0.tgz",
+      "integrity": "sha512-VTZkM9GWRAtEpveh7MSF6SjjrpNVNNVJfFup7xTY3UpFtm67foy9HDVXneLtFVt4pMz5kZtgNcvCniNFb1hlEQ==",
+      "funding": [
+        "https://github.com/fb55/htmlparser2?sponsor=1",
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/fb55"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "domelementtype": "^2.3.0",
+        "domhandler": "^5.0.3",
+        "domutils": "^3.2.2",
+        "entities": "^7.0.1"
+      }
+    },
     "node_modules/husky": {
       "version": "9.1.7",
       "resolved": "https://registry.npmjs.org/husky/-/husky-9.1.7.tgz",
@@ -3810,6 +3951,36 @@
         "url": "https://opencollective.com/parcel"
       }
     },
+    "node_modules/linkedom": {
+      "version": "0.18.12",
+      "resolved": "https://registry.npmjs.org/linkedom/-/linkedom-0.18.12.tgz",
+      "integrity": "sha512-jalJsOwIKuQJSeTvsgzPe9iJzyfVaEJiEXl+25EkKevsULHvMJzpNqwvj1jOESWdmgKDiXObyjOYwlUqG7wo1Q==",
+      "license": "ISC",
+      "dependencies": {
+        "css-select": "^5.1.0",
+        "cssom": "^0.5.0",
+        "html-escaper": "^3.0.3",
+        "htmlparser2": "^10.0.0",
+        "uhyphen": "^0.2.0"
+      },
+      "engines": {
+        "node": ">=16"
+      },
+      "peerDependencies": {
+        "canvas": ">= 2"
+      },
+      "peerDependenciesMeta": {
+        "canvas": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/linkedom/node_modules/html-escaper": {
+      "version": "3.0.3",
+      "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-3.0.3.tgz",
+      "integrity": "sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==",
+      "license": "MIT"
+    },
     "node_modules/lint-staged": {
       "version": "17.0.5",
       "resolved": "https://registry.npmjs.org/lint-staged/-/lint-staged-17.0.5.tgz",
@@ -4249,6 +4420,18 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/nth-check": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz",
+      "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==",
+      "license": "BSD-2-Clause",
+      "dependencies": {
+        "boolbase": "^1.0.0"
+      },
+      "funding": {
+        "url": "https://github.com/fb55/nth-check?sponsor=1"
+      }
+    },
     "node_modules/obug": {
       "version": "2.1.1",
       "resolved": "https://registry.npmjs.org/obug/-/obug-2.1.1.tgz",
@@ -5017,6 +5200,12 @@
         "typescript": ">=4.8.4 <6.1.0"
       }
     },
+    "node_modules/uhyphen": {
+      "version": "0.2.0",
+      "resolved": "https://registry.npmjs.org/uhyphen/-/uhyphen-0.2.0.tgz",
+      "integrity": "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA==",
+      "license": "ISC"
+    },
     "node_modules/undici": {
       "version": "7.24.8",
       "resolved": "https://registry.npmjs.org/undici/-/undici-7.24.8.tgz",
diff --git a/package.json b/package.json
index 1eb6406..17367b0 100644
--- a/package.json
+++ b/package.json
@@ -32,6 +32,7 @@
   "license": "MIT",
   "devDependencies": {
     "@cloudflare/workers-types": "4.20260522.1",
+    "@types/escape-html": "^1.0.4",
     "@types/mailparser": "^3.4.6",
     "@types/rss": "^0.0.32",
     "@vitest/coverage-v8": "4.1.7",
@@ -50,8 +51,10 @@
   },
   "dependencies": {
     "@hono/zod-validator": "^0.8.0",
+    "escape-html": "^1.0.3",
     "feed": "5.2.1",
     "hono": "4.12.22",
+    "linkedom": "^0.18.12",
     "postal-mime": "^2.7.4",
     "zod": "4.4.3"
   }
diff --git a/src/routes/admin/emails.tsx b/src/routes/admin/emails.tsx
index f182d74..eca9ae0 100644
--- a/src/routes/admin/emails.tsx
+++ b/src/routes/admin/emails.tsx
@@ -116,6 +116,30 @@ emailsRouter.get("/feeds/:feedId/emails", async (c) => {
             
             
           
+          
         
 
         

diff --git a/src/routes/entries.ts b/src/routes/entries.ts index 2a662d2..a8f7618 100644 --- a/src/routes/entries.ts +++ b/src/routes/entries.ts @@ -1,6 +1,7 @@ import { Context } from "hono"; import { html, raw } from "hono/html"; import { Env, FeedMetadata, EmailData } from "../types"; +import { processEmailContent } from "../utils/html-processor"; export async function handle(c: Context<{ Bindings: Env }>): Promise { const feedId = c.req.param("feedId"); @@ -82,7 +83,9 @@ export async function handle(c: Context<{ Bindings: Env }>): Promise {
Date:
${new Date(emailData.receivedAt).toUTCString()}
-
${raw(emailData.content)}
+
+ ${raw(processEmailContent(emailData.content))} +
`, ); diff --git a/src/routes/hub.test.ts b/src/routes/hub.test.ts index 2cc3855..0af453d 100644 --- a/src/routes/hub.test.ts +++ b/src/routes/hub.test.ts @@ -124,6 +124,21 @@ describe("POST /hub — input validation", () => { expect(res.status).toBe(400); }); + it("returns 400 when hub.topic uses an unsupported path (not rss or atom)", async () => { + const app = makeApp(); + const env = createMockEnv(); + const res = await app.request( + "/hub", + hubBody({ + "hub.mode": "subscribe", + "hub.topic": `https://${env.DOMAIN}/feed/feed1`, + "hub.callback": "https://cb.example/sub", + }), + env, + ); + expect(res.status).toBe(400); + }); + it("returns 400 when hub.secret exceeds 200 bytes", async () => { const app = makeApp(); const env = createMockEnv(); @@ -213,10 +228,51 @@ describe("POST /hub — subscribe", () => { ); expect(res.status).toBe(404); }); + + it("returns 202 for valid Atom subscribe request", async () => { + const app = makeApp(); + const env = createMockEnv(); + await env.EMAIL_STORAGE.put( + "feed:feed1:config", + JSON.stringify({ title: "Feed 1" }), + ); + server.use( + http.get("https://cb.example/sub", ({ request }) => { + const challenge = + new URL(request.url).searchParams.get("hub.challenge") ?? 
""; + return HttpResponse.text(challenge); + }), + ); + const res = await app.request( + "/hub", + hubBody({ + "hub.mode": "subscribe", + "hub.topic": `https://${env.DOMAIN}/atom/feed1`, + "hub.callback": "https://cb.example/sub", + }), + env, + ); + expect(res.status).toBe(202); + }); + + it("returns 404 for Atom topic when feed does not exist", async () => { + const app = makeApp(); + const env = createMockEnv(); + const res = await app.request( + "/hub", + hubBody({ + "hub.mode": "subscribe", + "hub.topic": `https://${env.DOMAIN}/atom/nonexistent`, + "hub.callback": "https://cb.example/sub", + }), + env, + ); + expect(res.status).toBe(404); + }); }); describe("POST /hub — unsubscribe", () => { - it("returns 202 for valid unsubscribe request", async () => { + it("returns 202 for valid RSS unsubscribe request", async () => { const app = makeApp(); const env = createMockEnv(); await env.EMAIL_STORAGE.put( @@ -241,4 +297,30 @@ describe("POST /hub — unsubscribe", () => { ); expect(res.status).toBe(202); }); + + it("returns 202 for valid Atom unsubscribe request", async () => { + const app = makeApp(); + const env = createMockEnv(); + await env.EMAIL_STORAGE.put( + "feed:feed1:config", + JSON.stringify({ title: "Feed 1" }), + ); + server.use( + http.get("https://cb.example/sub", ({ request }) => { + const challenge = + new URL(request.url).searchParams.get("hub.challenge") ?? 
""; + return HttpResponse.text(challenge); + }), + ); + const res = await app.request( + "/hub", + hubBody({ + "hub.mode": "unsubscribe", + "hub.topic": `https://${env.DOMAIN}/atom/feed1`, + "hub.callback": "https://cb.example/sub", + }), + env, + ); + expect(res.status).toBe(202); + }); }); diff --git a/src/routes/hub.ts b/src/routes/hub.ts index c8924d2..461c300 100644 --- a/src/routes/hub.ts +++ b/src/routes/hub.ts @@ -59,18 +59,19 @@ hubRouter.post("/", async (c) => { return c.text("Bad Request: hub.callback must use HTTPS", 400); } - // Validate that topic matches a known RSS feed on this hub + // Validate that topic matches a known RSS or Atom feed on this hub const topicPattern = new RegExp( - `^https://${env.DOMAIN.replaceAll(".", "\\.")}/rss/([^/]+)$`, + `^https://${env.DOMAIN.replaceAll(".", "\\.")}/(rss|atom)/([^/]+)$`, ); const match = topic.match(topicPattern); if (!match) { return c.text( - "Bad Request: hub.topic must be an RSS feed URL on this hub", + "Bad Request: hub.topic must be an RSS or Atom feed URL on this hub", 400, ); } - const feedId = match[1]; + const format = match[1] as "rss" | "atom"; + const feedId = match[2]; // Verify the feed exists before accepting any subscription const feedConfig = await env.EMAIL_STORAGE.get( @@ -99,6 +100,7 @@ hubRouter.post("/", async (c) => { callbackUrl as string, secret as string | undefined, leaseSeconds, + format, env, ), ); diff --git a/src/styles/components.css b/src/styles/components.css index c331b75..df86ab4 100644 --- a/src/styles/components.css +++ b/src/styles/components.css @@ -954,6 +954,17 @@ table.table code { border-color: rgba(255, 69, 58, 0.35); } +/* Validation badges */ +.feed-validate { + display: flex; + gap: 0.5rem; + margin-top: 1rem; +} + +.feed-validate img { + display: block; +} + /* Feed and Email Lists */ .feed-list, .email-list { diff --git a/src/utils/feed-generator.ts b/src/utils/feed-generator.ts index 261b75e..74a22a5 100644 --- a/src/utils/feed-generator.ts +++ 
b/src/utils/feed-generator.ts @@ -1,5 +1,8 @@ import { Feed } from "feed"; import { FeedConfig, EmailData } from "../types"; +import { processEmailContent } from "./html-processor"; + +export { processEmailContent as extractBodyContent }; function parseFromAddress(from: string): { name: string; email?: string } { const match = from.match(/^(.*?)\s*<([^>]+)>\s*$/); @@ -13,29 +16,6 @@ function parseFromAddress(from: string): { name: string; email?: string } { return { name: from.trim() }; } -// Email content is stored as a full HTML document. Feed readers expect only -// the body fragment in /, not a full document. -export function extractBodyContent(html: string): string { - const withClose = html.match(/]*>([\s\S]*?)<\/body>/i); - const body = withClose - ? withClose[1] - : (() => { - const withoutClose = html.match(/]*>([\s\S]*)/i); - return withoutClose - ? withoutClose[1].replace(/<\/html>\s*$/i, "") - : html; - })(); - // Strip mso-* properties from inline styles (Office HTML — triggers feed validator warnings) - return body.replace(/\bstyle="([^"]*)"/gi, (_match, style: string) => { - const cleaned = style - .split(";") - .map((p) => p.trim()) - .filter((p) => p && !/^mso-/i.test(p)) - .join("; "); - return cleaned ? 
`style="${cleaned}"` : ""; - }); -} - function buildFeed( feedConfig: FeedConfig, emails: EmailData[], @@ -70,7 +50,7 @@ function buildFeed( for (const email of emails) { const entryUrl = `${baseUrl}/entries/${feedId}/${email.receivedAt}`; const firstAttachment = email.attachments?.[0]; - const bodyContent = extractBodyContent(email.content); + const bodyContent = processEmailContent(email.content); feed.addItem({ title: email.subject, id: entryUrl, diff --git a/src/utils/html-processor.test.ts b/src/utils/html-processor.test.ts new file mode 100644 index 0000000..e64b471 --- /dev/null +++ b/src/utils/html-processor.test.ts @@ -0,0 +1,125 @@ +import { describe, it, expect } from "vitest"; +import { processEmailContent } from "./html-processor"; + +describe("processEmailContent — body extraction", () => { + it("extracts content inside tags", () => { + const html = "

Hello

"; + expect(processEmailContent(html)).toBe("

Hello

"); + }); + + it("handles body tag with attributes", () => { + const html = '

Hi

'; + expect(processEmailContent(html)).toBe("

Hi

"); + }); + + it("returns fragment unchanged when no body tags present", () => { + const fragment = "

Already a fragment

"; + expect(processEmailContent(fragment)).toBe("

Already a fragment

"); + }); + + it("is case-insensitive for body tag matching", () => { + const html = "

content

"; + expect(processEmailContent(html)).toBe("

content

"); + }); +}); + +describe("processEmailContent — plain text", () => { + it("wraps plain text in
", () => {
+    const text = "Hello world\nSecond line";
+    const result = processEmailContent(text);
+    expect(result).toMatch(/^
 in plain text", () => {
+    const text = "Price < 10 & size > 5";
+    const result = processEmailContent(text);
+    expect(result).toContain("<");
+    expect(result).toContain(">");
+    expect(result).toContain("&");
+    expect(result).not.toContain("<10");
+  });
+
+  it("returns empty string for empty input", () => {
+    expect(processEmailContent("")).toBe("");
+  });
+});
+
+describe("processEmailContent — dangerous element removal", () => {
+  it("removes ";
+    const result = processEmailContent(html);
+    expect(result).not.toContain("Hello

"); + }); + + it("removes

ok

"; + const result = processEmailContent(html); + expect(result).not.toContain("ok

"); + }); + + it("removes and tags", () => { + const html = "

ok

"; + const result = processEmailContent(html); + expect(result).not.toContain(" { + it("removes event handler attributes", () => { + const html = + "link"; + const result = processEmailContent(html); + expect(result).not.toContain("onclick"); + expect(result).toContain('href="https://x.com"'); + }); + + it("removes onerror on images", () => { + const html = ""; + const result = processEmailContent(html); + expect(result).not.toContain("onerror"); + }); + + it("removes javascript: hrefs", () => { + const html = "click"; + const result = processEmailContent(html); + expect(result).not.toContain("javascript:"); + }); + + it("preserves legitimate href and src attributes", () => { + const html = + "link"; + const result = processEmailContent(html); + expect(result).toContain("https://example.com"); + }); +}); + +describe("processEmailContent — mso style cleanup", () => { + it("strips mso-* properties from inline styles", () => { + const html = + '

text

'; + const result = processEmailContent(html); + expect(result).not.toContain("mso-margin-top"); + expect(result).toContain("color: red"); + }); + + it("removes style attribute entirely when only mso properties remain", () => { + const html = + '

text

'; + const result = processEmailContent(html); + expect(result).not.toContain("style="); + }); + + it("preserves style attribute when non-mso properties remain", () => { + const html = + '

text

'; + const result = processEmailContent(html); + expect(result).toContain("font-weight"); + expect(result).not.toContain("mso-font-size"); + }); +}); diff --git a/src/utils/html-processor.ts b/src/utils/html-processor.ts new file mode 100644 index 0000000..c9d1dfd --- /dev/null +++ b/src/utils/html-processor.ts @@ -0,0 +1,73 @@ +import { parseHTML } from "linkedom"; +import escapeHtml from "escape-html"; + +function cleanMsoStyles(style: string): string { + return style + .split(";") + .map((p) => p.trim()) + .filter((p) => p && !/^mso-/i.test(p)) + .join("; "); +} + +function isPlainText(content: string): boolean { + return !/<[a-z][\s\S]*>/i.test(content); +} + +function sanitizeElement(el: Element): void { + // Snapshot attribute names before mutating (linkedom attributes is array-like) + const attrs = Array.from( + el.attributes as unknown as ArrayLike<{ name: string }>, + ).map((a) => a.name); + for (const attr of attrs) { + // Remove event handlers (onclick, onerror, onload, …) + if (/^on/i.test(attr)) { + el.removeAttribute(attr); + continue; + } + // Remove javascript: URLs + if (["href", "src", "action"].includes(attr.toLowerCase())) { + const val = el.getAttribute(attr) ?? ""; + if (/^\s*javascript:/i.test(val)) { + el.removeAttribute(attr); + continue; + } + } + } + // Strip mso-* inline style properties (Office HTML noise) + const style = el.getAttribute("style"); + if (style !== null) { + const cleaned = cleanMsoStyles(style); + if (cleaned) { + el.setAttribute("style", cleaned); + } else { + el.removeAttribute("style"); + } + } +} + +/** + * Processes email content for safe display in feeds and entry pages: + * - Detects plain text and wraps it in a
 block
+ * - Extracts the  fragment from full HTML documents
+ * - Removes dangerous elements: