v3: fixed rich content extraction heuristics

2026-04-11 13:13:43 +00:00
parent b822f861ed
commit 34933a3d4f
6 changed files with 280 additions and 32 deletions
--- a/api/services/providers/generic.ts
+++ b/api/services/providers/generic.ts
@@ -2,6 +2,7 @@ import type { RichContent } from "../../model/interfaces.ts";
 import type { RichContentProvider } from "../rich-content-service.ts";
 import {
  extractBestIcon,
+  extractFirstContentImage,
  extractJsonLd,
  extractLargeImage,
  extractMetaName,
@@ -28,30 +29,45 @@ export const genericProvider: RichContentProvider = {
    const html = await res.text();
    const ld = extractJsonLd(html);

-    // Title: og:title → twitter:title → JSON-LD → <title>
-    const title = extractOgTag(html, "title") ??
+    // If og:url is present but its pathname differs from this page's (trailing
+    // slashes ignored) — e.g. it names the homepage — the og: block is a site-level
+    // fallback, so og:title and og:image are skipped and page-level signals win.
+    const ogUrl = extractOgTag(html, "url");
+    const useOg = !ogUrl || (() => {
+      try {
+        const ogPath = new URL(ogUrl).pathname.replace(/\/+$/, "") || "/";
+        const pagePath = new URL(url).pathname.replace(/\/+$/, "") || "/";
+        return ogPath === pagePath;
+      } catch {
+        return true;
+      }
+    })();
+
+    // Title: og:title (page-matched) → twitter:title → JSON-LD → <title>
+    const title = (useOg ? extractOgTag(html, "title") : undefined) ??
      extractMetaName(html, "twitter:title") ??
      ld.title ??
      extractPageTitle(html);

+    // Site name: og:site_name → hostname
+    const siteName = extractOgTag(html, "site_name") ??
+      new URL(url).hostname.replace(/^www\./, "");
+
    // Description: og:description → twitter:description → JSON-LD → <meta name="description">
    const description = extractOgTag(html, "description") ??
      extractMetaName(html, "twitter:description") ??
      ld.description ??
      extractMetaName(html, "description");

-    // Image: og:image → twitter:image → JSON-LD → first large <img> → best icon → /favicon.ico
-    const thumbnailUrl = extractOgTag(html, "image") ??
+    // Image: og:image (page-matched) → twitter:image → JSON-LD → large <img> → first content <img> → best icon → /favicon.ico
+    const thumbnailUrl = (useOg ? extractOgTag(html, "image") : undefined) ??
      extractMetaName(html, "twitter:image") ??
      ld.thumbnailUrl ??
      extractLargeImage(html, url) ??
+      extractFirstContentImage(html, url) ??
      extractBestIcon(html, url) ??
      `${new URL(url).origin}/favicon.ico`;

-    // Site name: og:site_name → hostname
-    const siteName = extractOgTag(html, "site_name") ??
-      new URL(url).hostname.replace(/^www\./, "");
-
    return {
      type: "generic",
      url,