v3: fixed rich content extraction heuristics
All checks were successful
Build and Publish Docker Image / build-and-push (push) Successful in 3m15s
All checks were successful
Build and Publish Docker Image / build-and-push (push) Successful in 3m15s
This commit is contained in:
@@ -25,7 +25,7 @@ RUN deno task build
|
|||||||
# ── Stage 2: runtime ──────────────────────────────────────────────────────────
|
# ── Stage 2: runtime ──────────────────────────────────────────────────────────
|
||||||
FROM denoland/deno:alpine-2.7.11
|
FROM denoland/deno:alpine-2.7.11
|
||||||
|
|
||||||
RUN apk add --no-cache ffmpeg
|
RUN apk add --no-cache ffmpeg curl
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import { Router } from "@oak/oak";
|
import { Router } from "@oak/oak";
|
||||||
import {
|
import {
|
||||||
fetchRichContent,
|
fetchRichContent,
|
||||||
|
fetchWithTimeout,
|
||||||
isValidHttpUrl,
|
isValidHttpUrl,
|
||||||
} from "../services/rich-content-service.ts";
|
} from "../services/rich-content-service.ts";
|
||||||
import { APIErrorCode } from "../model/interfaces.ts";
|
import { APIErrorCode } from "../model/interfaces.ts";
|
||||||
@@ -21,4 +22,44 @@ previewRouter.get("/api/preview", async (ctx) => {
|
|||||||
ctx.response.body = { success: true, data: data ?? null };
|
ctx.response.body = { success: true, data: data ?? null };
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/**
 * Proxy an external image through the server so HTTP thumbnail URLs don't
 * trigger mixed-content blocks when the frontend is served over HTTPS.
 *
 * Query parameter: `url` — the absolute http(s) URL of the image.
 *
 * Responses:
 * - 200 with the image bytes (cacheable for 24 h),
 * - 400 when the URL is invalid, the upstream is not an image, or the image
 *   exceeds 5 MB,
 * - 502 when the upstream fetch fails or answers with a non-2xx status.
 *
 * NOTE(review): this endpoint fetches caller-supplied URLs from the server.
 * Whether `isValidHttpUrl` rejects loopback/private hosts is not visible
 * here — confirm, otherwise this is an SSRF vector.
 */
previewRouter.get("/api/proxy-image", async (ctx) => {
  const MAX_SIZE = 5 * 1024 * 1024; // 5 MB

  const url = ctx.request.url.searchParams.get("url") ?? "";
  if (!isValidHttpUrl(url)) {
    ctx.response.status = 400;
    return;
  }

  try {
    const res = await fetchWithTimeout(url, 8000);

    // Reject the request and release the unread upstream body so the
    // underlying connection/resource is not left dangling.
    const reject = async (status: number): Promise<void> => {
      await res.body?.cancel().catch(() => {});
      ctx.response.status = status;
    };

    // Never relay (and cache for a day) an upstream error page as a 200.
    if (!res.ok) {
      await reject(502);
      return;
    }

    const contentType = res.headers.get("content-type") ?? "";
    if (!contentType.toLowerCase().startsWith("image/")) {
      await reject(400);
      return;
    }

    // Cheap early exit when the upstream announces an oversized body.
    const contentLength = Number(res.headers.get("content-length") ?? "0");
    if (contentLength > MAX_SIZE) {
      await reject(400);
      return;
    }

    // Content-Length may be absent or wrong, so enforce the cap while
    // streaming instead of buffering an unbounded body first.
    const chunks: Uint8Array[] = [];
    let total = 0;
    const reader = res.body?.getReader();
    if (reader) {
      for (;;) {
        const { done, value } = await reader.read();
        if (done) break;
        total += value.byteLength;
        if (total > MAX_SIZE) {
          await reader.cancel().catch(() => {});
          ctx.response.status = 400;
          return;
        }
        chunks.push(value);
      }
    }

    const bytes = new Uint8Array(total);
    let offset = 0;
    for (const chunk of chunks) {
      bytes.set(chunk, offset);
      offset += chunk.byteLength;
    }

    ctx.response.headers.set("Content-Type", contentType);
    ctx.response.headers.set("Cache-Control", "public, max-age=86400");
    // The body is untrusted third-party content served from our origin:
    // forbid MIME sniffing and sandbox it if opened directly (e.g. SVG).
    ctx.response.headers.set("X-Content-Type-Options", "nosniff");
    ctx.response.headers.set(
      "Content-Security-Policy",
      "default-src 'none'; sandbox",
    );
    ctx.response.body = bytes;
  } catch {
    // Network error, timeout, or a failure while reading the upstream body.
    ctx.response.status = 502;
  }
});
|
||||||
|
|
||||||
export default previewRouter;
|
export default previewRouter;
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import type { RichContent } from "../../model/interfaces.ts";
|
|||||||
import type { RichContentProvider } from "../rich-content-service.ts";
|
import type { RichContentProvider } from "../rich-content-service.ts";
|
||||||
import {
|
import {
|
||||||
extractBestIcon,
|
extractBestIcon,
|
||||||
|
extractFirstContentImage,
|
||||||
extractJsonLd,
|
extractJsonLd,
|
||||||
extractLargeImage,
|
extractLargeImage,
|
||||||
extractMetaName,
|
extractMetaName,
|
||||||
@@ -28,30 +29,45 @@ export const genericProvider: RichContentProvider = {
|
|||||||
const html = await res.text();
|
const html = await res.text();
|
||||||
const ld = extractJsonLd(html);
|
const ld = extractJsonLd(html);
|
||||||
|
|
||||||
// Title: og:title → twitter:title → JSON-LD → <title>
|
// If og:url is present but points to a different page (e.g. the homepage),
|
||||||
const title = extractOgTag(html, "title") ??
|
// the og: block is a site-level fallback, not page-specific metadata.
|
||||||
|
// In that case skip og:title and og:image so page-level signals win.
|
||||||
|
const ogUrl = extractOgTag(html, "url");
|
||||||
|
const useOg = !ogUrl || (() => {
|
||||||
|
try {
|
||||||
|
const ogPath = new URL(ogUrl).pathname.replace(/\/+$/, "") || "/";
|
||||||
|
const pagePath = new URL(url).pathname.replace(/\/+$/, "") || "/";
|
||||||
|
return ogPath === pagePath;
|
||||||
|
} catch {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
})();
|
||||||
|
|
||||||
|
// Title: og:title (page-matched) → twitter:title → JSON-LD → <title>
|
||||||
|
const title = (useOg ? extractOgTag(html, "title") : undefined) ??
|
||||||
extractMetaName(html, "twitter:title") ??
|
extractMetaName(html, "twitter:title") ??
|
||||||
ld.title ??
|
ld.title ??
|
||||||
extractPageTitle(html);
|
extractPageTitle(html);
|
||||||
|
|
||||||
|
// Site name: og:site_name → hostname
|
||||||
|
const siteName = extractOgTag(html, "site_name") ??
|
||||||
|
new URL(url).hostname.replace(/^www\./, "");
|
||||||
|
|
||||||
// Description: og:description → twitter:description → JSON-LD → <meta name="description">
|
// Description: og:description → twitter:description → JSON-LD → <meta name="description">
|
||||||
const description = extractOgTag(html, "description") ??
|
const description = extractOgTag(html, "description") ??
|
||||||
extractMetaName(html, "twitter:description") ??
|
extractMetaName(html, "twitter:description") ??
|
||||||
ld.description ??
|
ld.description ??
|
||||||
extractMetaName(html, "description");
|
extractMetaName(html, "description");
|
||||||
|
|
||||||
// Image: og:image → twitter:image → JSON-LD → first large <img> → best icon → /favicon.ico
|
// Image: og:image (page-matched) → twitter:image → JSON-LD → large <img> → first content <img> → best icon → /favicon.ico
|
||||||
const thumbnailUrl = extractOgTag(html, "image") ??
|
const thumbnailUrl = (useOg ? extractOgTag(html, "image") : undefined) ??
|
||||||
extractMetaName(html, "twitter:image") ??
|
extractMetaName(html, "twitter:image") ??
|
||||||
ld.thumbnailUrl ??
|
ld.thumbnailUrl ??
|
||||||
extractLargeImage(html, url) ??
|
extractLargeImage(html, url) ??
|
||||||
|
extractFirstContentImage(html, url) ??
|
||||||
extractBestIcon(html, url) ??
|
extractBestIcon(html, url) ??
|
||||||
`${new URL(url).origin}/favicon.ico`;
|
`${new URL(url).origin}/favicon.ico`;
|
||||||
|
|
||||||
// Site name: og:site_name → hostname
|
|
||||||
const siteName = extractOgTag(html, "site_name") ??
|
|
||||||
new URL(url).hostname.replace(/^www\./, "");
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
type: "generic",
|
type: "generic",
|
||||||
url,
|
url,
|
||||||
|
|||||||
@@ -26,28 +26,116 @@ const providers: RichContentProvider[] = [
|
|||||||
|
|
||||||
// Shared utilities exported for use by providers
|
// Shared utilities exported for use by providers
|
||||||
|
|
||||||
|
/**
 * Browser-like request headers sent with every outbound metadata fetch.
 * Shared by the native `fetch` path and the curl fallback so both present
 * the same User-Agent / Accept / Accept-Language values.
 */
const FETCH_HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
  Accept:
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
};
|
||||||
|
|
||||||
|
/**
 * Fetch `url` via a `curl --insecure` subprocess and return a minimal Response.
 * Used as a last resort when Deno's rustls rejects the server's TLS certificate
 * due to an unsupported algorithm (e.g. UnsupportedSignatureAlgorithm).
 *
 * Textual bodies (HTML/XML/text, or no declared type) are decoded with the
 * charset announced in the upstream Content-Type and re-emitted as UTF-8, so
 * `Response.text()` yields correct characters. Any other body (images, …) is
 * passed through as raw bytes with its original Content-Type, so binary
 * payloads are not corrupted by text decoding.
 *
 * SECURITY: `--insecure` disables certificate verification; the result must
 * be treated as untrusted. curl is restricted to http/https, including across
 * redirects.
 *
 * @param url        Absolute URL to fetch.
 * @param timeoutMs  Overall time budget, rounded up to whole seconds for curl.
 * @returns The response, or null if the temp file cannot be created, curl is
 *          unavailable, curl exits non-zero, or the output cannot be read.
 */
async function fetchViaCurl(
  url: string,
  timeoutMs: number,
): Promise<Response | null> {
  let tmpPath: string | undefined;
  try {
    // Created inside the try so a failure yields null, as documented.
    tmpPath = await Deno.makeTempFile();

    const { code, stdout } = await new Deno.Command("curl", {
      args: [
        "--silent",
        "--insecure",
        "--location",
        "--proto",
        "=http,https",
        "--proto-redir",
        "=http,https",
        "--max-time",
        // curl treats 0 as "no timeout"; always keep at least one second.
        String(Math.max(1, Math.ceil(timeoutMs / 1000))),
        "--user-agent",
        FETCH_HEADERS["User-Agent"],
        "--header",
        `Accept: ${FETCH_HEADERS["Accept"]}`,
        "--header",
        `Accept-Language: ${FETCH_HEADERS["Accept-Language"]}`,
        "--output",
        tmpPath,
        "--write-out",
        "%{content_type}",
        // End of options: the URL can never be parsed as a curl flag.
        "--",
        url,
      ],
      stdout: "piped",
      stderr: "null",
    }).output();

    if (code !== 0) return null;

    // stdout carries only the --write-out value: the final Content-Type.
    const contentType = new TextDecoder().decode(stdout).trim();
    const bytes = await Deno.readFile(tmpPath);

    const mime = contentType.split(";")[0].trim().toLowerCase();
    const isTextual = mime === "" || mime.startsWith("text/") ||
      mime === "application/xhtml+xml" || mime === "application/xml";

    if (!isTextual) {
      // Binary payload: hand the bytes through untouched.
      return new Response(bytes, { headers: { "content-type": contentType } });
    }

    // Decode using the charset declared in the Content-Type header so that
    // pages served in ISO-8859-1, windows-1252, etc. are read correctly.
    const charset = /charset=([\w-]+)/i.exec(contentType)?.[1] ?? "utf-8";
    let bodyText: string;
    try {
      bodyText = new TextDecoder(charset, { fatal: false }).decode(bytes);
    } catch {
      // Unknown charset label — fall back to UTF-8.
      bodyText = new TextDecoder("utf-8", { fatal: false }).decode(bytes);
    }
    // A string body is serialised as UTF-8, so advertise that charset.
    return new Response(bodyText, {
      headers: {
        "content-type": mime ? `${mime}; charset=utf-8` : "text/html",
      },
    });
  } catch {
    return null;
  } finally {
    if (tmpPath !== undefined) {
      await Deno.remove(tmpPath).catch(() => {});
    }
  }
}
|
||||||
|
|
||||||
/**
 * Fetch `url` with browser-like headers, aborting each attempt after
 * `timeoutMs` if no response has been received.
 *
 * When the first attempt fails with a TypeError whose message mentions
 * "certificate", two fallbacks are tried in order:
 *   1. a Deno HTTP client created with `allowInsecureCertificates: true`;
 *   2. a `curl --insecure` subprocess (see `fetchViaCurl`).
 * If both fail, the error from the first attempt is re-thrown. Any other
 * first-attempt error is re-thrown immediately.
 *
 * SECURITY: both fallbacks skip certificate verification; responses obtained
 * that way must be treated as untrusted.
 *
 * @param url        Absolute URL to fetch.
 * @param timeoutMs  Per-attempt timeout in milliseconds (default 5000).
 */
export async function fetchWithTimeout(
  url: string,
  timeoutMs = 5000,
): Promise<Response> {
  // One fetch attempt guarded by its own abort timer.
  const fetchOnce = async (client?: Deno.HttpClient): Promise<Response> => {
    const aborter = new AbortController();
    const handle = setTimeout(() => aborter.abort(), timeoutMs);
    const init: RequestInit & { client?: Deno.HttpClient } = {
      signal: aborter.signal,
      headers: FETCH_HEADERS,
    };
    if (client) init.client = client;
    try {
      return await fetch(url, init);
    } finally {
      clearTimeout(handle);
    }
  };

  let firstError: unknown;
  try {
    return await fetchOnce();
  } catch (err) {
    firstError = err;
  }

  const isCertificateFailure = firstError instanceof TypeError &&
    firstError.message.includes("certificate");
  if (!isCertificateFailure) throw firstError;

  // Retry 1: allowInsecureCertificates handles expired / self-signed certs.
  const insecureClient = Deno.createHttpClient({
    allowInsecureCertificates: true,
  });
  try {
    return await fetchOnce(insecureClient);
  } catch {
    /* UnsupportedSignatureAlgorithm etc. — rustls can't help */
  } finally {
    insecureClient.close();
  }

  // Retry 2: curl uses its own TLS stack and supports a wider set of
  // certificate algorithms that Deno/rustls rejects.
  const viaCurl = await fetchViaCurl(url, timeoutMs);
  if (viaCurl !== null) return viaCurl;

  throw firstError;
}
|
||||||
|
|
||||||
function decodeHtmlEntities(str: string): string {
|
function decodeHtmlEntities(str: string): string {
|
||||||
return str
|
return str
|
||||||
.replace(/&/gi, "&")
|
.replace(/&/gi, "&")
|
||||||
@@ -243,6 +331,66 @@ export function extractBestIcon(
|
|||||||
return candidates[0].href;
|
return candidates[0].href;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Filename tokens that mark an image as UI chrome (logos, icons, navigation
 * art, tracking pixels, backgrounds) rather than page content.
 */
const UI_IMAGE_KEYWORDS = new Set([
  "logo",
  "icon",
  "sprite",
  "favicon",
  "avatar",
  "banner",
  "header",
  "nav",
  "menu",
  "cart",
  "search",
  "tracking",
  "pixel",
  "bg",
  "background",
]);

/**
 * Heuristically decide whether an image URL points at UI chrome.
 *
 * Returns true when the URL is an SVG, or when any token of its file name
 * (extension stripped, split on `-`, `_` and `.`, lower-cased) is one of
 * `UI_IMAGE_KEYWORDS`. Only whole tokens match: `site-logo.png` is UI,
 * `iconography.png` is not.
 *
 * The query string and fragment are ignored when locating the file name, so
 * `art.svg#layer` is still recognised as SVG and a `/` inside a fragment is
 * never mistaken for a path separator.
 *
 * @param src  Raw `src` attribute value (absolute or relative URL).
 */
function isUiImage(src: string): boolean {
  // SVGs are almost always icons/illustrations; the extension may be
  // followed by a query string or a fragment.
  if (/\.svg(?:[?#]|$)/i.test(src)) return true;

  const path = src.split(/[?#]/)[0];
  const filename = path.split("/").pop() ?? "";
  const baseName = filename.replace(/\.[^.]+$/, ""); // strip extension

  // Split on common filename separators (-, _, .) and check each token
  return baseName.toLowerCase().split(/[-_.]/).some((token) =>
    UI_IMAGE_KEYWORDS.has(token)
  );
}
|
||||||
|
|
||||||
|
/**
 * Return the `src` of the first `<img>` that looks like content rather than UI
 * chrome. Skips data URIs and every image that `isUiImage` classifies as UI
 * (SVGs and files whose name tokens match icon/logo/nav patterns).
 * Resolves relative and protocol-relative URLs against `baseUrl`.
 *
 * The attribute value must be quoted; the opening and closing quote have to
 * match, so a URL containing the other quote character is kept intact.
 * `&amp;` in the attribute is decoded before the URL is resolved.
 *
 * @param html     Raw HTML of the page.
 * @param baseUrl  URL of the page, used to resolve relative `src` values.
 * @returns The absolute image URL, or undefined when no candidate is found.
 */
export function extractFirstContentImage(
  html: string,
  baseUrl: string,
): string | undefined {
  const imgPattern = /<img[^>]+>/gi;
  // Double- and single-quoted values are matched separately so that the
  // closing quote is always the same character as the opening one.
  const srcPattern = /\bsrc\s*=\s*(?:"([^"]*)"|'([^']*)')/i;

  for (const [tag] of html.matchAll(imgPattern)) {
    const attr = srcPattern.exec(tag);
    const rawSrc = (attr?.[1] ?? attr?.[2])?.trim();
    if (!rawSrc || rawSrc.startsWith("data:")) continue;

    // Attribute values are HTML-escaped; an undecoded "&amp;" would corrupt
    // the query string of the resolved URL.
    const src = rawSrc.replace(/&amp;/gi, "&");
    if (isUiImage(src)) continue;

    try {
      return new URL(src, baseUrl).toString();
    } catch {
      // Unresolvable src — try the next <img>.
      continue;
    }
  }
  return undefined;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract `href` from the first `<link rel="…">` whose rel contains `relFragment`,
|
* Extract `href` from the first `<link rel="…">` whose rel contains `relFragment`,
|
||||||
* resolved to an absolute URL using `baseUrl`.
|
* resolved to an absolute URL using `baseUrl`.
|
||||||
@@ -307,6 +455,16 @@ export async function fetchRichContent(
|
|||||||
return await provider.fetch(url);
|
return await provider.fetch(url);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.error(`[rich-content] Failed to fetch metadata for ${url}:`, err);
|
console.error(`[rich-content] Failed to fetch metadata for ${url}:`, err);
|
||||||
|
// Return a minimal stub so the caller always gets something displayable
|
||||||
|
// (e.g. when the site has a bad TLS cert or the fetch times out).
|
||||||
|
try {
|
||||||
|
return {
|
||||||
|
type: "generic",
|
||||||
|
url,
|
||||||
|
siteName: new URL(url).hostname.replace(/^www\./, ""),
|
||||||
|
};
|
||||||
|
} catch {
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -38,11 +38,29 @@ export function JournalCard(
|
|||||||
navigate(dumpUrl(dump));
|
navigate(dumpUrl(dump));
|
||||||
}
|
}
|
||||||
|
|
||||||
const thumbnailUrl = dump.kind === "file" &&
|
const rawThumbnail =
|
||||||
dump.fileMime?.startsWith("image/")
|
dump.kind === "file" && dump.fileMime?.startsWith("image/")
|
||||||
? `${API_URL}/api/files/${dump.id}?v=${dump.fileSize ?? 0}`
|
? `${API_URL}/api/files/${dump.id}?v=${dump.fileSize ?? 0}`
|
||||||
: (dump.richContent?.thumbnailUrl ?? null);
|
: (dump.richContent?.thumbnailUrl ?? null);
|
||||||
|
|
||||||
|
// Route external HTTP thumbnails through the server proxy to avoid
|
||||||
|
// mixed-content blocks when the frontend is served over HTTPS.
|
||||||
|
const thumbnailUrl = (() => {
|
||||||
|
if (!rawThumbnail) return null;
|
||||||
|
try {
|
||||||
|
const u = new URL(rawThumbnail);
|
||||||
|
if (
|
||||||
|
u.protocol === "http:" && u.hostname !== "localhost" &&
|
||||||
|
u.hostname !== "127.0.0.1"
|
||||||
|
) {
|
||||||
|
return `${API_URL}/api/proxy-image?url=${
|
||||||
|
encodeURIComponent(rawThumbnail)
|
||||||
|
}`;
|
||||||
|
}
|
||||||
|
} catch { /* relative URL */ }
|
||||||
|
return rawThumbnail;
|
||||||
|
})();
|
||||||
|
|
||||||
const fallbackIcon = dump.kind === "file"
|
const fallbackIcon = dump.kind === "file"
|
||||||
? (() => {
|
? (() => {
|
||||||
const m = dump.fileMime ?? "";
|
const m = dump.fileMime ?? "";
|
||||||
|
|||||||
@@ -1,6 +1,21 @@
|
|||||||
import { useContext } from "react";
|
import { useContext } from "react";
|
||||||
import type { RichContent } from "../model.ts";
|
import type { RichContent } from "../model.ts";
|
||||||
import { PlayerContext } from "../contexts/PlayerContext.ts";
|
import { PlayerContext } from "../contexts/PlayerContext.ts";
|
||||||
|
import { API_URL } from "../config/api.ts";
|
||||||
|
|
||||||
|
/**
 * Rewrite a plain-HTTP thumbnail URL so it is loaded through the server's
 * `/api/proxy-image` endpoint, avoiding mixed-content blocks.
 *
 * HTTPS URLs, URLs on `localhost` / `127.0.0.1`, and anything that does not
 * parse as an absolute URL (e.g. a relative path) are returned unchanged.
 */
function proxyIfHttp(url: string): string {
  const localHosts = ["localhost", "127.0.0.1"];
  try {
    const { protocol, hostname } = new URL(url);
    const needsProxy = protocol === "http:" && !localHosts.includes(hostname);
    if (needsProxy) {
      return `${API_URL}/api/proxy-image?url=${encodeURIComponent(url)}`;
    }
  } catch {
    /* relative URL — leave as-is */
  }
  return url;
}
|
||||||
|
|
||||||
interface RichContentCardProps {
|
interface RichContentCardProps {
|
||||||
richContent: RichContent;
|
richContent: RichContent;
|
||||||
@@ -38,7 +53,7 @@ export default function RichContentCard(
|
|||||||
{richContent.thumbnailUrl
|
{richContent.thumbnailUrl
|
||||||
? (
|
? (
|
||||||
<img
|
<img
|
||||||
src={richContent.thumbnailUrl}
|
src={proxyIfHttp(richContent.thumbnailUrl!)}
|
||||||
alt={richContent.title ?? ""}
|
alt={richContent.title ?? ""}
|
||||||
className="rich-content-compact-thumbnail"
|
className="rich-content-compact-thumbnail"
|
||||||
onError={(e) => {
|
onError={(e) => {
|
||||||
@@ -65,7 +80,7 @@ export default function RichContentCard(
|
|||||||
{richContent.thumbnailUrl
|
{richContent.thumbnailUrl
|
||||||
? (
|
? (
|
||||||
<img
|
<img
|
||||||
src={richContent.thumbnailUrl}
|
src={proxyIfHttp(richContent.thumbnailUrl!)}
|
||||||
alt={richContent.title ?? ""}
|
alt={richContent.title ?? ""}
|
||||||
className="rich-content-compact-thumbnail"
|
className="rich-content-compact-thumbnail"
|
||||||
onError={(e) => {
|
onError={(e) => {
|
||||||
@@ -96,7 +111,7 @@ export default function RichContentCard(
|
|||||||
aria-label="Play"
|
aria-label="Play"
|
||||||
>
|
>
|
||||||
<img
|
<img
|
||||||
src={richContent.thumbnailUrl}
|
src={proxyIfHttp(richContent.thumbnailUrl!)}
|
||||||
alt={richContent.title ?? ""}
|
alt={richContent.title ?? ""}
|
||||||
className="rich-content-thumbnail"
|
className="rich-content-thumbnail"
|
||||||
onError={(e) => {
|
onError={(e) => {
|
||||||
@@ -108,7 +123,7 @@ export default function RichContentCard(
|
|||||||
)
|
)
|
||||||
: (
|
: (
|
||||||
<img
|
<img
|
||||||
src={richContent.thumbnailUrl}
|
src={proxyIfHttp(richContent.thumbnailUrl!)}
|
||||||
alt={richContent.title ?? ""}
|
alt={richContent.title ?? ""}
|
||||||
className="rich-content-thumbnail"
|
className="rich-content-thumbnail"
|
||||||
onError={(e) => {
|
onError={(e) => {
|
||||||
|
|||||||
Reference in New Issue
Block a user