v3: fixed rich content extraction heuristics
All checks were successful
Build and Publish Docker Image / build-and-push (push) Successful in 3m15s

This commit is contained in:
khannurien
2026-04-11 13:13:43 +00:00
parent b822f861ed
commit 34933a3d4f
6 changed files with 280 additions and 32 deletions

View File

@@ -2,6 +2,7 @@ import type { RichContent } from "../../model/interfaces.ts";
import type { RichContentProvider } from "../rich-content-service.ts";
import {
extractBestIcon,
extractFirstContentImage,
extractJsonLd,
extractLargeImage,
extractMetaName,
@@ -28,30 +29,45 @@ export const genericProvider: RichContentProvider = {
const html = await res.text();
const ld = extractJsonLd(html);
// Title: og:title → twitter:title → JSON-LD → <title>
const title = extractOgTag(html, "title") ??
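// If og:url is present but its path differs from the requested page's path
// (e.g. it points at the homepage), the og: block is a site-level fallback,
// not page-specific metadata. In that case skip og:title and og:image so
// page-level signals win. Only pathnames are compared (trailing slashes
// ignored); an og:url that cannot be parsed is treated as matching.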
const ogUrl = extractOgTag(html, "url");
const useOg = !ogUrl || (() => {
try {
const ogPath = new URL(ogUrl).pathname.replace(/\/+$/, "") || "/";
const pagePath = new URL(url).pathname.replace(/\/+$/, "") || "/";
return ogPath === pagePath;
} catch {
return true;
}
})();
// Title: og:title (page-matched) → twitter:title → JSON-LD → <title>
const title = (useOg ? extractOgTag(html, "title") : undefined) ??
extractMetaName(html, "twitter:title") ??
ld.title ??
extractPageTitle(html);
// Site name: og:site_name → hostname
const siteName = extractOgTag(html, "site_name") ??
new URL(url).hostname.replace(/^www\./, "");
// Description: og:description → twitter:description → JSON-LD → <meta name="description">
const description = extractOgTag(html, "description") ??
extractMetaName(html, "twitter:description") ??
ld.description ??
extractMetaName(html, "description");
// Image: og:image → twitter:image → JSON-LD → first large <img> → best icon → /favicon.ico
const thumbnailUrl = extractOgTag(html, "image") ??
// Image: og:image (page-matched) → twitter:image → JSON-LD → large <img> → first content <img> → best icon → /favicon.ico
const thumbnailUrl = (useOg ? extractOgTag(html, "image") : undefined) ??
extractMetaName(html, "twitter:image") ??
ld.thumbnailUrl ??
extractLargeImage(html, url) ??
extractFirstContentImage(html, url) ??
extractBestIcon(html, url) ??
`${new URL(url).origin}/favicon.ico`;
// Site name: og:site_name → hostname
const siteName = extractOgTag(html, "site_name") ??
new URL(url).hostname.replace(/^www\./, "");
return {
type: "generic",
url,