v3: fixed rich content extraction heuristics

2026-04-11 13:13:43 +00:00
parent b822f861ed
commit 34933a3d4f
6 changed files with 280 additions and 32 deletions
--- a/api/services/rich-content-service.ts
+++ b/api/services/rich-content-service.ts
@@ -26,25 +26,113 @@ const providers: RichContentProvider[] = [

 // Shared utilities exported for use by providers

+const FETCH_HEADERS = {
+  "User-Agent":
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
+  "Accept":
+    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+  "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
+};
+
+/**
+ * Last-resort fetch via a `curl --insecure` subprocess, for servers whose TLS
+ * certificate Deno's rustls rejects (e.g. UnsupportedSignatureAlgorithm).
+ * The body is wrapped in a synthetic Response (default status, `text/html`).
+ * Returns null if curl cannot be spawned, exits non-zero, or reading fails.
+ */
+async function fetchViaCurl(
+  url: string,
+  timeoutMs: number,
+): Promise<Response | null> {
+  const tmpPath = await Deno.makeTempFile();
+  try {
+    const { code, stdout } = await new Deno.Command("curl", {
+      args: [
+        "--silent",
+        "--insecure",
+        "--location",
+        "--max-time",
+        String(Math.ceil(timeoutMs / 1000)),
+        "--user-agent",
+        FETCH_HEADERS["User-Agent"],
+        "--header",
+        `Accept: ${FETCH_HEADERS["Accept"]}`,
+        "--header",
+        `Accept-Language: ${FETCH_HEADERS["Accept-Language"]}`,
+        "--output",
+        tmpPath,
+        "--write-out",
+        "%{content_type}",
+        url,
+      ],
+      stdout: "piped",
+      stderr: "null",
+    }).output();
+
+    if (code !== 0) return null;
+
+    const contentType = new TextDecoder().decode(stdout).trim();
+    const bytes = await Deno.readFile(tmpPath);
+    // stdout carries only the --write-out Content-Type; decode the body with
+    // its declared charset (ISO-8859-1, windows-1252, ...), else UTF-8.
+    const charset = /charset=([\w-]+)/i.exec(contentType)?.[1] ?? "utf-8";
+    let bodyText: string;
+    try {
+      bodyText = new TextDecoder(charset, { fatal: false }).decode(bytes);
+    } catch {
+      bodyText = new TextDecoder("utf-8", { fatal: false }).decode(bytes);
+    }
+    return new Response(bodyText, { headers: { "content-type": "text/html" } });
+  } catch {
+    return null;
+  } finally {
+    await Deno.remove(tmpPath).catch(() => {});
+  }
+}
+
 export async function fetchWithTimeout(
  url: string,
  timeoutMs = 5000,
 ): Promise<Response> {
-  const controller = new AbortController();
-  const timer = setTimeout(() => controller.abort(), timeoutMs);
+  async function attempt(
+    extraInit?: Record<string, unknown>,
+  ): Promise<Response> {
+    const controller = new AbortController();
+    const timer = setTimeout(() => controller.abort(), timeoutMs);
+    try {
+      return await fetch(url, {
+        signal: controller.signal,
+        headers: FETCH_HEADERS,
+        ...extraInit,
+      } as RequestInit);
+    } finally {
+      clearTimeout(timer);
+    }
+  }
+
  try {
-    return await fetch(url, {
-      signal: controller.signal,
-      headers: {
-        "User-Agent":
-          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
-        "Accept":
-          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
-        "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
-      },
-    });
-  } finally {
-    clearTimeout(timer);
+    return await attempt();
+  } catch (err) {
+    if (!(err instanceof TypeError && err.message.includes("certificate"))) {
+      throw err;
+    }
+
+    // Retry 1: same fetch with certificate validation disabled (expired / self-signed).
+    const client = Deno.createHttpClient({ allowInsecureCertificates: true });
+    try {
+      return await attempt({ client });
+    } catch {
+      /* rustls still refuses (e.g. UnsupportedSignatureAlgorithm): try curl */
+    } finally {
+      client.close();
+    }
+
+    // Retry 2: curl uses its own TLS stack and supports a wider set of
+    // certificate algorithms; if it also fails, the original error is rethrown.
+    const curlRes = await fetchViaCurl(url, timeoutMs);
+    if (curlRes) return curlRes;
+
+    throw err;
  }
 }

@@ -243,6 +331,66 @@ export function extractBestIcon(
  return candidates[0].href;
 }

+/**
+ * Lower-case filename tokens that mark an image as UI chrome rather than
+ * content (logos, icons, sprites, navigation and tracking assets, etc.).
+ * `isUiImage` splits an image's base filename on `-`, `_` and `.` and treats
+ * the image as UI when any resulting token is in this set.
+ */
+const UI_IMAGE_KEYWORDS = new Set([
+  "logo",
+  "icon",
+  "sprite",
+  "favicon",
+  "avatar",
+  "banner",
+  "header",
+  "nav",
+  "menu",
+  "cart",
+  "search",
+  "tracking",
+  "pixel",
+  "bg",
+  "background",
+]);
+
+function isUiImage(src: string): boolean {
+  if (/\.svg(\?|$)/i.test(src)) return true;
+  const filename = src.split("?")[0].split("/").pop() ?? "";
+  const baseName = filename.replace(/\.[^.]+$/, ""); // drop the final ".ext" only
+  // Whole-token match on -, _ and . boundaries, so "iconic.png" is not flagged
+  return baseName.toLowerCase().split(/[-_.]/).some((t) =>
+    UI_IMAGE_KEYWORDS.has(t)
+  );
+}
+
+/**
+ * Return the absolute URL (resolved against `baseUrl`) of the first `<img>`
+ * whose quoted `src` is not a data URI and is not rejected by `isUiImage`;
+ * unresolvable values are skipped, and undefined is returned if none qualify.
+ * NOTE(review): `\bsrc=` also matches `data-src=` when it precedes `src`.
+ */
+export function extractFirstContentImage(
+  html: string,
+  baseUrl: string,
+): string | undefined {
+  const imgPattern = /<img[^>]+>/gi;
+  let match: RegExpExecArray | null;
+  while ((match = imgPattern.exec(html)) !== null) {
+    const tag = match[0];
+    const src = /\bsrc=["']([^"']+)["']/i.exec(tag)?.[1];
+    if (!src || src.startsWith("data:")) continue;
+    if (isUiImage(src)) continue;
+    try {
+      return new URL(src, baseUrl).toString();
+    } catch {
+      continue;
+    }
+  }
+  return undefined;
+}
+
 /**
 * Extract `href` from the first `<link rel="…">` whose rel contains `relFragment`,
 * resolved to an absolute URL using `baseUrl`.
@@ -307,6 +455,16 @@ export async function fetchRichContent(
    return await provider.fetch(url);
  } catch (err) {
    console.error(`[rich-content] Failed to fetch metadata for ${url}:`, err);
-    return undefined;
+    // Return a minimal stub so the caller always gets something displayable
+    // (e.g. when the site has a bad TLS cert or the fetch times out).
+    try {
+      return {
+        type: "generic",
+        url,
+        siteName: new URL(url).hostname.replace(/^www\./, ""),
+      };
+    } catch {
+      return undefined;
+    }
  }
 }