From 34933a3d4fd12b6ecd1cbbfce22f4110ada1243d Mon Sep 17 00:00:00 2001 From: khannurien Date: Sat, 11 Apr 2026 13:13:43 +0000 Subject: [PATCH] v3: fixed rich content extraction heuristics --- Dockerfile | 2 +- api/routes/preview.ts | 41 ++++++ api/services/providers/generic.ts | 32 +++-- api/services/rich-content-service.ts | 188 ++++++++++++++++++++++++--- src/components/JournalCard.tsx | 26 +++- src/components/RichContentCard.tsx | 23 +++- 6 files changed, 280 insertions(+), 32 deletions(-) diff --git a/Dockerfile b/Dockerfile index 16490f8..76af45b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,7 +25,7 @@ RUN deno task build # ── Stage 2: runtime ────────────────────────────────────────────────────────── FROM denoland/deno:alpine-2.7.11 -RUN apk add --no-cache ffmpeg +RUN apk add --no-cache ffmpeg curl WORKDIR /app diff --git a/api/routes/preview.ts b/api/routes/preview.ts index 31619ff..9f6e6d3 100644 --- a/api/routes/preview.ts +++ b/api/routes/preview.ts @@ -1,6 +1,7 @@ import { Router } from "@oak/oak"; import { fetchRichContent, + fetchWithTimeout, isValidHttpUrl, } from "../services/rich-content-service.ts"; import { APIErrorCode } from "../model/interfaces.ts"; @@ -21,4 +22,44 @@ previewRouter.get("/api/preview", async (ctx) => { ctx.response.body = { success: true, data: data ?? null }; }); +/** + * Proxy an external image through the server so HTTP thumbnail URLs don't + * trigger mixed-content blocks when the frontend is served over HTTPS. + */ +previewRouter.get("/api/proxy-image", async (ctx) => { + const url = ctx.request.url.searchParams.get("url") ?? ""; + if (!isValidHttpUrl(url)) { + ctx.response.status = 400; + return; + } + + try { + const res = await fetchWithTimeout(url, 8000); + const contentType = res.headers.get("content-type") ?? ""; + if (!contentType.startsWith("image/")) { + ctx.response.status = 400; + return; + } + + const MAX_SIZE = 5 * 1024 * 1024; // 5 MB + const contentLength = Number(res.headers.get("content-length") ?? 
"0"); + if (contentLength > MAX_SIZE) { + ctx.response.status = 400; + return; + } + + const bytes = new Uint8Array(await res.arrayBuffer()); + if (bytes.length > MAX_SIZE) { + ctx.response.status = 400; + return; + } + + ctx.response.headers.set("Content-Type", contentType); + ctx.response.headers.set("Cache-Control", "public, max-age=86400"); + ctx.response.body = bytes; + } catch { + ctx.response.status = 502; + } +}); + export default previewRouter; diff --git a/api/services/providers/generic.ts b/api/services/providers/generic.ts index 3bced69..f3a00da 100644 --- a/api/services/providers/generic.ts +++ b/api/services/providers/generic.ts @@ -2,6 +2,7 @@ import type { RichContent } from "../../model/interfaces.ts"; import type { RichContentProvider } from "../rich-content-service.ts"; import { extractBestIcon, + extractFirstContentImage, extractJsonLd, extractLargeImage, extractMetaName, @@ -28,30 +29,45 @@ export const genericProvider: RichContentProvider = { const html = await res.text(); const ld = extractJsonLd(html); - // Title: og:title → twitter:title → JSON-LD → - const title = extractOgTag(html, "title") ?? + // If og:url is present but points to a different page (e.g. the homepage), + // the og: block is a site-level fallback, not page-specific metadata. + // In that case skip og:title and og:image so page-level signals win. + const ogUrl = extractOgTag(html, "url"); + const useOg = !ogUrl || (() => { + try { + const ogPath = new URL(ogUrl).pathname.replace(/\/+$/, "") || "/"; + const pagePath = new URL(url).pathname.replace(/\/+$/, "") || "/"; + return ogPath === pagePath; + } catch { + return true; + } + })(); + + // Title: og:title (page-matched) → twitter:title → JSON-LD → <title> + const title = (useOg ? extractOgTag(html, "title") : undefined) ?? extractMetaName(html, "twitter:title") ?? ld.title ?? extractPageTitle(html); + // Site name: og:site_name → hostname + const siteName = extractOgTag(html, "site_name") ?? 
+ new URL(url).hostname.replace(/^www\./, ""); + // Description: og:description → twitter:description → JSON-LD → <meta name="description"> const description = extractOgTag(html, "description") ?? extractMetaName(html, "twitter:description") ?? ld.description ?? extractMetaName(html, "description"); - // Image: og:image → twitter:image → JSON-LD → first large <img> → best icon → /favicon.ico - const thumbnailUrl = extractOgTag(html, "image") ?? + // Image: og:image (page-matched) → twitter:image → JSON-LD → large <img> → first content <img> → best icon → /favicon.ico + const thumbnailUrl = (useOg ? extractOgTag(html, "image") : undefined) ?? extractMetaName(html, "twitter:image") ?? ld.thumbnailUrl ?? extractLargeImage(html, url) ?? + extractFirstContentImage(html, url) ?? extractBestIcon(html, url) ?? `${new URL(url).origin}/favicon.ico`; - // Site name: og:site_name → hostname - const siteName = extractOgTag(html, "site_name") ?? - new URL(url).hostname.replace(/^www\./, ""); - return { type: "generic", url, diff --git a/api/services/rich-content-service.ts b/api/services/rich-content-service.ts index 85f882b..8d37c22 100644 --- a/api/services/rich-content-service.ts +++ b/api/services/rich-content-service.ts @@ -26,25 +26,113 @@ const providers: RichContentProvider[] = [ // Shared utilities exported for use by providers +const FETCH_HEADERS = { + "User-Agent": + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", + "Accept": + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7", +}; + +/** + * Fetch `url` via a `curl --insecure` subprocess and return a minimal Response. + * Used as a last resort when Deno's rustls rejects the server's TLS certificate + * due to an unsupported algorithm (e.g. UnsupportedSignatureAlgorithm). + * Returns null if curl is unavailable or exits non-zero. 
+ */ +async function fetchViaCurl( + url: string, + timeoutMs: number, +): Promise<Response | null> { + const tmpPath = await Deno.makeTempFile(); + try { + const { code, stdout } = await new Deno.Command("curl", { + args: [ + "--silent", + "--insecure", + "--location", + "--max-time", + String(Math.ceil(timeoutMs / 1000)), + "--user-agent", + FETCH_HEADERS["User-Agent"], + "--header", + `Accept: ${FETCH_HEADERS["Accept"]}`, + "--header", + `Accept-Language: ${FETCH_HEADERS["Accept-Language"]}`, + "--output", + tmpPath, + "--write-out", + "%{content_type}", + url, + ], + stdout: "piped", + stderr: "null", + }).output(); + + if (code !== 0) return null; + + const contentType = new TextDecoder().decode(stdout).trim(); + const bytes = await Deno.readFile(tmpPath); + // Decode using the charset declared in the Content-Type header so that + // pages served in ISO-8859-1, windows-1252, etc. are read correctly. + const charset = /charset=([\w-]+)/i.exec(contentType)?.[1] ?? "utf-8"; + let bodyText: string; + try { + bodyText = new TextDecoder(charset, { fatal: false }).decode(bytes); + } catch { + bodyText = new TextDecoder("utf-8", { fatal: false }).decode(bytes); + } + return new Response(bodyText, { headers: { "content-type": "text/html" } }); + } catch { + return null; + } finally { + await Deno.remove(tmpPath).catch(() => {}); + } +} + export async function fetchWithTimeout( url: string, timeoutMs = 5000, ): Promise<Response> { - const controller = new AbortController(); - const timer = setTimeout(() => controller.abort(), timeoutMs); + async function attempt( + extraInit?: Record<string, unknown>, + ): Promise<Response> { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), timeoutMs); + try { + return await fetch(url, { + signal: controller.signal, + headers: FETCH_HEADERS, + ...extraInit, + } as RequestInit); + } finally { + clearTimeout(timer); + } + } + try { - return await fetch(url, { - signal: controller.signal, - 
headers: { - "User-Agent": - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", - "Accept": - "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7", - }, - }); - } finally { - clearTimeout(timer); + return await attempt(); + } catch (err) { + if (!(err instanceof TypeError && err.message.includes("certificate"))) { + throw err; + } + + // Retry 1: allowInsecureCertificates handles expired / self-signed certs. + const client = Deno.createHttpClient({ allowInsecureCertificates: true }); + try { + return await attempt({ client }); + } catch { + /* UnsupportedSignatureAlgorithm etc. — rustls can't help */ + } finally { + client.close(); + } + + // Retry 2: curl uses its own TLS stack and supports a wider set of + // certificate algorithms that Deno/rustls rejects. + const curlRes = await fetchViaCurl(url, timeoutMs); + if (curlRes) return curlRes; + + throw err; } } @@ -243,6 +331,66 @@ export function extractBestIcon( return candidates[0].href; } +/** + * Filename tokens that mark an image as UI chrome rather than page content + * (logos, icons, sprites, navigation art, tracking pixels, backgrounds, etc.). + * `isUiImage` lowercases an image's extension-less filename, splits it on `-`, + * `_` and `.`, and treats the image as UI chrome when any token is in this set. + */ +const UI_IMAGE_KEYWORDS = new Set([ + "logo", + "icon", + "sprite", + "favicon", + "avatar", + "banner", + "header", + "nav", + "menu", + "cart", + "search", + "tracking", + "pixel", + "bg", + "background", +]); + +function isUiImage(src: string): boolean { + if (/\.svg(\?|$)/i.test(src)) return true; + const filename = src.split("?")[0].split("/").pop() ?? ""; + const baseName = filename.replace(/\.[^.]+$/, ""); // strip extension + // Split on common filename separators (-, _, .) 
and check each token + return baseName.toLowerCase().split(/[-_.]/).some((t) => + UI_IMAGE_KEYWORDS.has(t) + ); +} + +/** + * Return the `src` of the first `<img>` that looks like content rather than UI + * chrome. Skips SVGs, data URIs, and images whose filename tokens match common + * icon/logo/nav patterns (logo, icon, sprite, etc.). + * Resolves relative and protocol-relative URLs. + */ +export function extractFirstContentImage( + html: string, + baseUrl: string, +): string | undefined { + const imgPattern = /<img[^>]+>/gi; + let match: RegExpExecArray | null; + while ((match = imgPattern.exec(html)) !== null) { + const tag = match[0]; + const src = /\bsrc=["']([^"']+)["']/i.exec(tag)?.[1]; + if (!src || src.startsWith("data:")) continue; + if (isUiImage(src)) continue; + try { + return new URL(src, baseUrl).toString(); + } catch { + continue; + } + } + return undefined; +} + /** * Extract `href` from the first `<link rel="…">` whose rel contains `relFragment`, * resolved to an absolute URL using `baseUrl`. @@ -307,6 +455,16 @@ export async function fetchRichContent( return await provider.fetch(url); } catch (err) { console.error(`[rich-content] Failed to fetch metadata for ${url}:`, err); - return undefined; + // Return a minimal stub so the caller always gets something displayable + // (e.g. when the site has a bad TLS cert or the fetch times out). + try { + return { + type: "generic", + url, + siteName: new URL(url).hostname.replace(/^www\./, ""), + }; + } catch { + return undefined; + } } } diff --git a/src/components/JournalCard.tsx b/src/components/JournalCard.tsx index 37a0dcb..05e44ca 100644 --- a/src/components/JournalCard.tsx +++ b/src/components/JournalCard.tsx @@ -38,10 +38,28 @@ export function JournalCard( navigate(dumpUrl(dump)); } - const thumbnailUrl = dump.kind === "file" && - dump.fileMime?.startsWith("image/") - ? `${API_URL}/api/files/${dump.id}?v=${dump.fileSize ?? 0}` - : (dump.richContent?.thumbnailUrl ?? 
null); + const rawThumbnail = + dump.kind === "file" && dump.fileMime?.startsWith("image/") + ? `${API_URL}/api/files/${dump.id}?v=${dump.fileSize ?? 0}` + : (dump.richContent?.thumbnailUrl ?? null); + + // Route external HTTP thumbnails through the server proxy to avoid + // mixed-content blocks when the frontend is served over HTTPS. + const thumbnailUrl = (() => { + if (!rawThumbnail) return null; + try { + const u = new URL(rawThumbnail); + if ( + u.protocol === "http:" && u.hostname !== "localhost" && + u.hostname !== "127.0.0.1" + ) { + return `${API_URL}/api/proxy-image?url=${ + encodeURIComponent(rawThumbnail) + }`; + } + } catch { /* relative URL */ } + return rawThumbnail; + })(); const fallbackIcon = dump.kind === "file" ? (() => { diff --git a/src/components/RichContentCard.tsx b/src/components/RichContentCard.tsx index 99d792e..d11a1ca 100644 --- a/src/components/RichContentCard.tsx +++ b/src/components/RichContentCard.tsx @@ -1,6 +1,21 @@ import { useContext } from "react"; import type { RichContent } from "../model.ts"; import { PlayerContext } from "../contexts/PlayerContext.ts"; +import { API_URL } from "../config/api.ts"; + +/** Route HTTP thumbnail URLs through the server proxy to avoid mixed-content blocks. */ +function proxyIfHttp(url: string): string { + try { + const u = new URL(url); + if ( + u.protocol === "http:" && u.hostname !== "localhost" && + u.hostname !== "127.0.0.1" + ) { + return `${API_URL}/api/proxy-image?url=${encodeURIComponent(url)}`; + } + } catch { /* relative URL — leave as-is */ } + return url; +} interface RichContentCardProps { richContent: RichContent; @@ -38,7 +53,7 @@ export default function RichContentCard( {richContent.thumbnailUrl ? ( <img - src={richContent.thumbnailUrl} + src={proxyIfHttp(richContent.thumbnailUrl!)} alt={richContent.title ?? ""} className="rich-content-compact-thumbnail" onError={(e) => { @@ -65,7 +80,7 @@ export default function RichContentCard( {richContent.thumbnailUrl ? 
( <img - src={richContent.thumbnailUrl} + src={proxyIfHttp(richContent.thumbnailUrl!)} alt={richContent.title ?? ""} className="rich-content-compact-thumbnail" onError={(e) => { @@ -96,7 +111,7 @@ export default function RichContentCard( aria-label="Play" > <img - src={richContent.thumbnailUrl} + src={proxyIfHttp(richContent.thumbnailUrl!)} alt={richContent.title ?? ""} className="rich-content-thumbnail" onError={(e) => { @@ -108,7 +123,7 @@ export default function RichContentCard( ) : ( <img - src={richContent.thumbnailUrl} + src={proxyIfHttp(richContent.thumbnailUrl!)} alt={richContent.title ?? ""} className="rich-content-thumbnail" onError={(e) => {