v3: fixed rich content extraction heuristics
All checks were successful
Build and Publish Docker Image / build-and-push (push) Successful in 3m15s
All checks were successful
Build and Publish Docker Image / build-and-push (push) Successful in 3m15s
This commit is contained in:
@@ -2,6 +2,7 @@ import type { RichContent } from "../../model/interfaces.ts";
|
||||
import type { RichContentProvider } from "../rich-content-service.ts";
|
||||
import {
|
||||
extractBestIcon,
|
||||
extractFirstContentImage,
|
||||
extractJsonLd,
|
||||
extractLargeImage,
|
||||
extractMetaName,
|
||||
@@ -28,30 +29,45 @@ export const genericProvider: RichContentProvider = {
|
||||
const html = await res.text();
|
||||
const ld = extractJsonLd(html);
|
||||
|
||||
// Title: og:title → twitter:title → JSON-LD → <title>
|
||||
const title = extractOgTag(html, "title") ??
|
||||
// If og:url is present but its path differs from this page's path (e.g. it points to the homepage),
|
||||
// the og: block is a site-level fallback, not page-specific metadata.
|
||||
// In that case skip og:title and og:image so page-level signals win.
|
||||
const ogUrl = extractOgTag(html, "url");
|
||||
const useOg = !ogUrl || (() => {
|
||||
try {
|
||||
const ogPath = new URL(ogUrl).pathname.replace(/\/+$/, "") || "/";
|
||||
const pagePath = new URL(url).pathname.replace(/\/+$/, "") || "/";
|
||||
return ogPath === pagePath;
|
||||
} catch {
|
||||
return true;
|
||||
}
|
||||
})();
|
||||
|
||||
// Title: og:title (page-matched) → twitter:title → JSON-LD → <title>
|
||||
const title = (useOg ? extractOgTag(html, "title") : undefined) ??
|
||||
extractMetaName(html, "twitter:title") ??
|
||||
ld.title ??
|
||||
extractPageTitle(html);
|
||||
|
||||
// Site name: og:site_name → hostname
|
||||
const siteName = extractOgTag(html, "site_name") ??
|
||||
new URL(url).hostname.replace(/^www\./, "");
|
||||
|
||||
// Description: og:description → twitter:description → JSON-LD → <meta name="description">
|
||||
const description = extractOgTag(html, "description") ??
|
||||
extractMetaName(html, "twitter:description") ??
|
||||
ld.description ??
|
||||
extractMetaName(html, "description");
|
||||
|
||||
// Image: og:image → twitter:image → JSON-LD → first large <img> → best icon → /favicon.ico
|
||||
const thumbnailUrl = extractOgTag(html, "image") ??
|
||||
// Image: og:image (page-matched) → twitter:image → JSON-LD → large <img> → first content <img> → best icon → /favicon.ico
|
||||
const thumbnailUrl = (useOg ? extractOgTag(html, "image") : undefined) ??
|
||||
extractMetaName(html, "twitter:image") ??
|
||||
ld.thumbnailUrl ??
|
||||
extractLargeImage(html, url) ??
|
||||
extractFirstContentImage(html, url) ??
|
||||
extractBestIcon(html, url) ??
|
||||
`${new URL(url).origin}/favicon.ico`;
|
||||
|
||||
// Site name: og:site_name → hostname
|
||||
const siteName = extractOgTag(html, "site_name") ??
|
||||
new URL(url).hostname.replace(/^www\./, "");
|
||||
|
||||
return {
|
||||
type: "generic",
|
||||
url,
|
||||
|
||||
Reference in New Issue
Block a user