v3: fixed rich content extraction heuristics
All checks were successful
Build and Publish Docker Image / build-and-push (push) Successful in 3m15s

This commit is contained in:
khannurien
2026-04-11 13:13:43 +00:00
parent b822f861ed
commit 34933a3d4f
6 changed files with 280 additions and 32 deletions

View File

@@ -25,7 +25,7 @@ RUN deno task build
# ── Stage 2: runtime ────────────────────────────────────────────────────────── # ── Stage 2: runtime ──────────────────────────────────────────────────────────
FROM denoland/deno:alpine-2.7.11 FROM denoland/deno:alpine-2.7.11
RUN apk add --no-cache ffmpeg RUN apk add --no-cache ffmpeg curl
WORKDIR /app WORKDIR /app

View File

@@ -1,6 +1,7 @@
import { Router } from "@oak/oak"; import { Router } from "@oak/oak";
import { import {
fetchRichContent, fetchRichContent,
fetchWithTimeout,
isValidHttpUrl, isValidHttpUrl,
} from "../services/rich-content-service.ts"; } from "../services/rich-content-service.ts";
import { APIErrorCode } from "../model/interfaces.ts"; import { APIErrorCode } from "../model/interfaces.ts";
@@ -21,4 +22,44 @@ previewRouter.get("/api/preview", async (ctx) => {
ctx.response.body = { success: true, data: data ?? null }; ctx.response.body = { success: true, data: data ?? null };
}); });
/**
 * Proxy an external image through the server so HTTP thumbnail URLs don't
 * trigger mixed-content blocks when the frontend is served over HTTPS.
 *
 * Query parameter `url` is the absolute http(s) URL of the image to relay.
 *
 * Responses:
 * - 200 with the image bytes, the upstream `Content-Type`, and a one-day
 *   public cache lifetime;
 * - 400 when `url` is not a valid http(s) URL, the upstream content type is
 *   not `image/*`, or the body exceeds 5 MB;
 * - 502 when the upstream request fails or answers with a non-2xx status.
 *
 * NOTE(review): this endpoint fetches caller-supplied URLs from the server.
 * `isValidHttpUrl` is the only guard visible here — confirm it (or the network
 * layer) prevents requests to private/internal addresses.
 */
previewRouter.get("/api/proxy-image", async (ctx) => {
  const MAX_SIZE = 5 * 1024 * 1024; // 5 MB

  const url = ctx.request.url.searchParams.get("url") ?? "";
  if (!isValidHttpUrl(url)) {
    ctx.response.status = 400;
    return;
  }

  try {
    const res = await fetchWithTimeout(url, 8000);

    // Discard the upstream body before answering with an error status so the
    // underlying connection is released instead of lingering until GC.
    const reject = async (status: number): Promise<void> => {
      await res.body?.cancel().catch(() => {});
      ctx.response.status = status;
    };

    // An upstream error page must not be relayed as a successful image.
    if (!res.ok) {
      await reject(502);
      return;
    }

    const contentType = res.headers.get("content-type") ?? "";
    if (!contentType.startsWith("image/")) {
      await reject(400);
      return;
    }

    // Fast path: trust an honest Content-Length to refuse oversized bodies
    // without downloading them.
    const contentLength = Number(res.headers.get("content-length") ?? "0");
    if (contentLength > MAX_SIZE) {
      await reject(400);
      return;
    }

    // Content-Length may be absent or wrong, so enforce the cap while
    // streaming rather than buffering the whole body first.
    const chunks: Uint8Array[] = [];
    let total = 0;
    const reader = res.body?.getReader();
    if (reader) {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        total += value.byteLength;
        if (total > MAX_SIZE) {
          await reader.cancel().catch(() => {});
          ctx.response.status = 400;
          return;
        }
        chunks.push(value);
      }
    }

    const bytes = new Uint8Array(total);
    let offset = 0;
    for (const chunk of chunks) {
      bytes.set(chunk, offset);
      offset += chunk.byteLength;
    }

    ctx.response.headers.set("Content-Type", contentType);
    ctx.response.headers.set("Cache-Control", "public, max-age=86400");
    // The payload is untrusted third-party content served from this origin:
    // forbid MIME sniffing and sandbox it in case it is opened as a document
    // (e.g. an SVG containing script).
    ctx.response.headers.set("X-Content-Type-Options", "nosniff");
    ctx.response.headers.set(
      "Content-Security-Policy",
      "default-src 'none'; style-src 'unsafe-inline'; sandbox",
    );
    ctx.response.body = bytes;
  } catch {
    ctx.response.status = 502;
  }
});
export default previewRouter; export default previewRouter;

View File

@@ -2,6 +2,7 @@ import type { RichContent } from "../../model/interfaces.ts";
import type { RichContentProvider } from "../rich-content-service.ts"; import type { RichContentProvider } from "../rich-content-service.ts";
import { import {
extractBestIcon, extractBestIcon,
extractFirstContentImage,
extractJsonLd, extractJsonLd,
extractLargeImage, extractLargeImage,
extractMetaName, extractMetaName,
@@ -28,30 +29,45 @@ export const genericProvider: RichContentProvider = {
const html = await res.text(); const html = await res.text();
const ld = extractJsonLd(html); const ld = extractJsonLd(html);
// Title: og:title → twitter:title → JSON-LD → <title> // If og:url is present but points to a different page (e.g. the homepage),
const title = extractOgTag(html, "title") ?? // the og: block is a site-level fallback, not page-specific metadata.
// In that case skip og:title and og:image so page-level signals win.
const ogUrl = extractOgTag(html, "url");
const useOg = !ogUrl || (() => {
try {
const ogPath = new URL(ogUrl).pathname.replace(/\/+$/, "") || "/";
const pagePath = new URL(url).pathname.replace(/\/+$/, "") || "/";
return ogPath === pagePath;
} catch {
return true;
}
})();
// Title: og:title (page-matched) → twitter:title → JSON-LD → <title>
const title = (useOg ? extractOgTag(html, "title") : undefined) ??
extractMetaName(html, "twitter:title") ?? extractMetaName(html, "twitter:title") ??
ld.title ?? ld.title ??
extractPageTitle(html); extractPageTitle(html);
// Site name: og:site_name → hostname
const siteName = extractOgTag(html, "site_name") ??
new URL(url).hostname.replace(/^www\./, "");
// Description: og:description → twitter:description → JSON-LD → <meta name="description"> // Description: og:description → twitter:description → JSON-LD → <meta name="description">
const description = extractOgTag(html, "description") ?? const description = extractOgTag(html, "description") ??
extractMetaName(html, "twitter:description") ?? extractMetaName(html, "twitter:description") ??
ld.description ?? ld.description ??
extractMetaName(html, "description"); extractMetaName(html, "description");
// Image: og:image → twitter:image → JSON-LD → first large <img> → best icon → /favicon.ico // Image: og:image (page-matched) → twitter:image → JSON-LD → large <img> → first content <img> → best icon → /favicon.ico
const thumbnailUrl = extractOgTag(html, "image") ?? const thumbnailUrl = (useOg ? extractOgTag(html, "image") : undefined) ??
extractMetaName(html, "twitter:image") ?? extractMetaName(html, "twitter:image") ??
ld.thumbnailUrl ?? ld.thumbnailUrl ??
extractLargeImage(html, url) ?? extractLargeImage(html, url) ??
extractFirstContentImage(html, url) ??
extractBestIcon(html, url) ?? extractBestIcon(html, url) ??
`${new URL(url).origin}/favicon.ico`; `${new URL(url).origin}/favicon.ico`;
// Site name: og:site_name → hostname
const siteName = extractOgTag(html, "site_name") ??
new URL(url).hostname.replace(/^www\./, "");
return { return {
type: "generic", type: "generic",
url, url,

View File

@@ -26,28 +26,116 @@ const providers: RichContentProvider[] = [
// Shared utilities exported for use by providers // Shared utilities exported for use by providers
/**
 * Request headers sent with every outbound metadata fetch.
 * They present the request as a desktop Chrome browser with a French-first
 * language preference. The same values are passed both to `fetch` and to the
 * `curl` fallback so the two code paths send the same headers.
 */
const FETCH_HEADERS = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
"Accept":
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
};
/**
 * Fetch `url` via a `curl --insecure` subprocess and return a minimal Response.
 * Used as a last resort when Deno's rustls rejects the server's TLS certificate
 * due to an unsupported algorithm (e.g. UnsupportedSignatureAlgorithm).
 *
 * Media payloads (`image/*`, `audio/*`, `video/*`, `font/*`) are returned as
 * raw bytes with the content type curl reported. Everything else is decoded to
 * text using the declared charset and labelled `text/html`.
 *
 * @param url       Absolute URL to download.
 * @param timeoutMs Overall time budget; rounded up to whole seconds for curl.
 * @returns The response, or `null` if the temp file cannot be created, curl is
 *          unavailable, or curl exits non-zero.
 */
async function fetchViaCurl(
  url: string,
  timeoutMs: number,
): Promise<Response | null> {
  let tmpPath: string;
  try {
    tmpPath = await Deno.makeTempFile();
  } catch {
    // No scratch space: report "fallback unavailable" like any other failure.
    return null;
  }

  try {
    const { code, stdout } = await new Deno.Command("curl", {
      args: [
        "--silent",
        "--insecure",
        "--location",
        // Certificate checks are disabled, so at least keep curl (and any
        // redirect it follows) on the web protocols this service deals with.
        "--proto",
        "=http,https",
        "--proto-redir",
        "=http,https",
        "--max-time",
        String(Math.ceil(timeoutMs / 1000)),
        "--user-agent",
        FETCH_HEADERS["User-Agent"],
        "--header",
        `Accept: ${FETCH_HEADERS["Accept"]}`,
        "--header",
        `Accept-Language: ${FETCH_HEADERS["Accept-Language"]}`,
        "--output",
        tmpPath,
        // The body goes to the temp file; stdout carries only the content type.
        "--write-out",
        "%{content_type}",
        url,
      ],
      stdout: "piped",
      stderr: "null",
    }).output();
    if (code !== 0) return null;

    const contentType = new TextDecoder().decode(stdout).trim();
    const bytes = await Deno.readFile(tmpPath);

    // Binary media must not go through a text decoder: that would corrupt the
    // bytes and hide the real media type from callers that inspect it.
    if (/^(?:image|audio|video|font)\//i.test(contentType)) {
      return new Response(bytes, { headers: { "content-type": contentType } });
    }

    // Decode using the charset declared in the Content-Type header so that
    // pages served in ISO-8859-1, windows-1252, etc. are read correctly.
    const charset = /charset=([\w-]+)/i.exec(contentType)?.[1] ?? "utf-8";
    let bodyText: string;
    try {
      bodyText = new TextDecoder(charset, { fatal: false }).decode(bytes);
    } catch {
      // Unknown charset label: fall back to UTF-8.
      bodyText = new TextDecoder("utf-8", { fatal: false }).decode(bytes);
    }
    return new Response(bodyText, { headers: { "content-type": "text/html" } });
  } catch {
    return null;
  } finally {
    await Deno.remove(tmpPath).catch(() => {});
  }
}
export async function fetchWithTimeout( export async function fetchWithTimeout(
url: string, url: string,
timeoutMs = 5000, timeoutMs = 5000,
): Promise<Response> {
async function attempt(
extraInit?: Record<string, unknown>,
): Promise<Response> { ): Promise<Response> {
const controller = new AbortController(); const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), timeoutMs); const timer = setTimeout(() => controller.abort(), timeoutMs);
try { try {
return await fetch(url, { return await fetch(url, {
signal: controller.signal, signal: controller.signal,
headers: { headers: FETCH_HEADERS,
"User-Agent": ...extraInit,
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", } as RequestInit);
"Accept":
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
},
});
} finally { } finally {
clearTimeout(timer); clearTimeout(timer);
} }
} }
try {
return await attempt();
} catch (err) {
if (!(err instanceof TypeError && err.message.includes("certificate"))) {
throw err;
}
// Retry 1: allowInsecureCertificates handles expired / self-signed certs.
const client = Deno.createHttpClient({ allowInsecureCertificates: true });
try {
return await attempt({ client });
} catch {
/* UnsupportedSignatureAlgorithm etc. — rustls can't help */
} finally {
client.close();
}
// Retry 2: curl uses its own TLS stack and supports a wider set of
// certificate algorithms that Deno/rustls rejects.
const curlRes = await fetchViaCurl(url, timeoutMs);
if (curlRes) return curlRes;
throw err;
}
}
function decodeHtmlEntities(str: string): string { function decodeHtmlEntities(str: string): string {
return str return str
.replace(/&amp;/gi, "&") .replace(/&amp;/gi, "&")
@@ -243,6 +331,66 @@ export function extractBestIcon(
return candidates[0].href; return candidates[0].href;
} }
/**
 * Lower-case filename tokens that mark an image as UI chrome (logos, icons,
 * navigation art, tracking pixels, backgrounds) rather than page content.
 */
const UI_IMAGE_KEYWORDS = new Set([
  "logo",
  "icon",
  "sprite",
  "favicon",
  "avatar",
  "banner",
  "header",
  "nav",
  "menu",
  "cart",
  "search",
  "tracking",
  "pixel",
  "bg",
  "background",
]);

/**
 * Decide whether an image URL looks like UI chrome rather than content.
 *
 * An image is treated as UI when it is an SVG, or when any `-`, `_` or `.`
 * separated token of its filename (extension removed, case-insensitive) is
 * listed in `UI_IMAGE_KEYWORDS`.
 *
 * @param src Raw `src` attribute value; may be relative and may carry a query
 *            string and/or fragment.
 * @returns `true` when the image should be skipped as UI chrome.
 */
function isUiImage(src: string): boolean {
  // A fragment is never part of the file name; drop it so references such as
  // "shapes.svg#arrow" are still recognised as SVG.
  const withoutFragment = src.split("#", 1)[0] ?? "";
  if (/\.svg(\?|$)/i.test(withoutFragment)) return true;

  const filename = withoutFragment.split("?")[0].split("/").pop() ?? "";
  const baseName = filename.replace(/\.[^.]+$/, ""); // strip extension
  // Split on common filename separators (-, _, .) and check each token
  return baseName
    .toLowerCase()
    .split(/[-_.]/)
    .some((token) => UI_IMAGE_KEYWORDS.has(token));
}
/**
 * Find the first `<img>` in `html` that appears to be real content and return
 * its source as an absolute URL.
 *
 * Tags without a quoted `src`, `data:` URIs, images rejected by `isUiImage`
 * (SVGs and logo/icon/sprite-style filenames), and sources that cannot be
 * resolved against `baseUrl` are skipped. Relative and protocol-relative
 * sources are resolved against `baseUrl`.
 *
 * @param html    Raw HTML of the page.
 * @param baseUrl URL of the page, used to resolve non-absolute sources.
 * @returns The absolute image URL, or `undefined` when no image qualifies.
 */
export function extractFirstContentImage(
  html: string,
  baseUrl: string,
): string | undefined {
  for (const [imgTag] of html.matchAll(/<img[^>]+>/gi)) {
    const candidate = /\bsrc=["']([^"']+)["']/i.exec(imgTag)?.[1];

    const unusable = !candidate ||
      candidate.startsWith("data:") ||
      isUiImage(candidate);
    if (unusable) continue;

    try {
      return new URL(candidate, baseUrl).toString();
    } catch {
      // Unresolvable source: move on to the next tag.
    }
  }
  return undefined;
}
/** /**
* Extract `href` from the first `<link rel="…">` whose rel contains `relFragment`, * Extract `href` from the first `<link rel="…">` whose rel contains `relFragment`,
* resolved to an absolute URL using `baseUrl`. * resolved to an absolute URL using `baseUrl`.
@@ -307,6 +455,16 @@ export async function fetchRichContent(
return await provider.fetch(url); return await provider.fetch(url);
} catch (err) { } catch (err) {
console.error(`[rich-content] Failed to fetch metadata for ${url}:`, err); console.error(`[rich-content] Failed to fetch metadata for ${url}:`, err);
// Return a minimal stub so the caller always gets something displayable
// (e.g. when the site has a bad TLS cert or the fetch times out).
try {
return {
type: "generic",
url,
siteName: new URL(url).hostname.replace(/^www\./, ""),
};
} catch {
return undefined; return undefined;
} }
} }
}

View File

@@ -38,11 +38,29 @@ export function JournalCard(
navigate(dumpUrl(dump)); navigate(dumpUrl(dump));
} }
const thumbnailUrl = dump.kind === "file" && const rawThumbnail =
dump.fileMime?.startsWith("image/") dump.kind === "file" && dump.fileMime?.startsWith("image/")
? `${API_URL}/api/files/${dump.id}?v=${dump.fileSize ?? 0}` ? `${API_URL}/api/files/${dump.id}?v=${dump.fileSize ?? 0}`
: (dump.richContent?.thumbnailUrl ?? null); : (dump.richContent?.thumbnailUrl ?? null);
// Route external HTTP thumbnails through the server proxy to avoid
// mixed-content blocks when the frontend is served over HTTPS.
const thumbnailUrl = (() => {
if (!rawThumbnail) return null;
try {
const u = new URL(rawThumbnail);
if (
u.protocol === "http:" && u.hostname !== "localhost" &&
u.hostname !== "127.0.0.1"
) {
return `${API_URL}/api/proxy-image?url=${
encodeURIComponent(rawThumbnail)
}`;
}
} catch { /* relative URL */ }
return rawThumbnail;
})();
const fallbackIcon = dump.kind === "file" const fallbackIcon = dump.kind === "file"
? (() => { ? (() => {
const m = dump.fileMime ?? ""; const m = dump.fileMime ?? "";

View File

@@ -1,6 +1,21 @@
import { useContext } from "react"; import { useContext } from "react";
import type { RichContent } from "../model.ts"; import type { RichContent } from "../model.ts";
import { PlayerContext } from "../contexts/PlayerContext.ts"; import { PlayerContext } from "../contexts/PlayerContext.ts";
import { API_URL } from "../config/api.ts";
/**
 * Rewrite a plain-HTTP thumbnail URL so it is loaded through the server's
 * image proxy, avoiding mixed-content blocks on an HTTPS frontend.
 *
 * URLs that are not `http:`, that point at `localhost` / `127.0.0.1`, or that
 * cannot be parsed as absolute URLs (e.g. relative paths) are returned as-is.
 */
function proxyIfHttp(url: string): string {
  try {
    const { protocol, hostname } = new URL(url);
    const isLoopback = hostname === "localhost" || hostname === "127.0.0.1";
    if (protocol === "http:" && !isLoopback) {
      const encodedTarget = encodeURIComponent(url);
      return `${API_URL}/api/proxy-image?url=${encodedTarget}`;
    }
  } catch {
    // Relative or otherwise unparseable URL: fall through and leave it alone.
  }
  return url;
}
interface RichContentCardProps { interface RichContentCardProps {
richContent: RichContent; richContent: RichContent;
@@ -38,7 +53,7 @@ export default function RichContentCard(
{richContent.thumbnailUrl {richContent.thumbnailUrl
? ( ? (
<img <img
src={richContent.thumbnailUrl} src={proxyIfHttp(richContent.thumbnailUrl!)}
alt={richContent.title ?? ""} alt={richContent.title ?? ""}
className="rich-content-compact-thumbnail" className="rich-content-compact-thumbnail"
onError={(e) => { onError={(e) => {
@@ -65,7 +80,7 @@ export default function RichContentCard(
{richContent.thumbnailUrl {richContent.thumbnailUrl
? ( ? (
<img <img
src={richContent.thumbnailUrl} src={proxyIfHttp(richContent.thumbnailUrl!)}
alt={richContent.title ?? ""} alt={richContent.title ?? ""}
className="rich-content-compact-thumbnail" className="rich-content-compact-thumbnail"
onError={(e) => { onError={(e) => {
@@ -96,7 +111,7 @@ export default function RichContentCard(
aria-label="Play" aria-label="Play"
> >
<img <img
src={richContent.thumbnailUrl} src={proxyIfHttp(richContent.thumbnailUrl!)}
alt={richContent.title ?? ""} alt={richContent.title ?? ""}
className="rich-content-thumbnail" className="rich-content-thumbnail"
onError={(e) => { onError={(e) => {
@@ -108,7 +123,7 @@ export default function RichContentCard(
) )
: ( : (
<img <img
src={richContent.thumbnailUrl} src={proxyIfHttp(richContent.thumbnailUrl!)}
alt={richContent.title ?? ""} alt={richContent.title ?? ""}
className="rich-content-thumbnail" className="rich-content-thumbnail"
onError={(e) => { onError={(e) => {