// Rich-content resolution: provider registry plus shared HTML-metadata helpers.
import type { RichContent } from "../model/interfaces.ts";
|
|
import { youtubeProvider } from "./providers/youtube.ts";
|
|
import { bandcampProvider } from "./providers/bandcamp.ts";
|
|
import { soundcloudProvider } from "./providers/soundcloud.ts";
|
|
import { selfProvider } from "./providers/self.ts";
|
|
import { genericProvider } from "./providers/generic.ts";
|
|
|
|
/** A pluggable resolver that turns a URL into displayable RichContent. */
export interface RichContentProvider {
  /** Provider identifier. */
  name: string;
  /** Returns true when this provider knows how to handle `url`. */
  matches(url: string): boolean;
  /** Fetch and build rich-content metadata for `url`. May reject. */
  fetch(url: string): Promise<RichContent>;
}
/**
 * Register providers in priority order. The first match wins.
 * `selfProvider` resolves gerbeur URLs directly from the DB (no HTTP round-trip).
 * `genericProvider` must stay last — it always matches.
 */
const providers: RichContentProvider[] = [
  // Site-specific providers first, most specific to least specific.
  youtubeProvider,
  bandcampProvider,
  soundcloudProvider,
  selfProvider,
  // Catch-all fallback — always matches, must remain last.
  genericProvider,
];
// Shared utilities exported for use by providers

// Browser-like request headers sent with every metadata fetch.
// NOTE(review): mimics Chrome on Windows — presumably so sites that vary their
// markup (or block requests) by user agent serve the normal HTML; confirm.
const FETCH_HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
  "Accept":
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
};
/**
|
|
* Fetch `url` via a `curl --insecure` subprocess and return a minimal Response.
|
|
* Used as a last resort when Deno's rustls rejects the server's TLS certificate
|
|
* due to an unsupported algorithm (e.g. UnsupportedSignatureAlgorithm).
|
|
* Returns null if curl is unavailable or exits non-zero.
|
|
*/
|
|
async function fetchViaCurl(
|
|
url: string,
|
|
timeoutMs: number,
|
|
): Promise<Response | null> {
|
|
const tmpPath = await Deno.makeTempFile();
|
|
try {
|
|
const { code, stdout } = await new Deno.Command("curl", {
|
|
args: [
|
|
"--silent",
|
|
"--insecure",
|
|
"--location",
|
|
"--max-time",
|
|
String(Math.ceil(timeoutMs / 1000)),
|
|
"--user-agent",
|
|
FETCH_HEADERS["User-Agent"],
|
|
"--header",
|
|
`Accept: ${FETCH_HEADERS["Accept"]}`,
|
|
"--header",
|
|
`Accept-Language: ${FETCH_HEADERS["Accept-Language"]}`,
|
|
"--output",
|
|
tmpPath,
|
|
"--write-out",
|
|
"%{content_type}",
|
|
url,
|
|
],
|
|
stdout: "piped",
|
|
stderr: "null",
|
|
}).output();
|
|
|
|
if (code !== 0) return null;
|
|
|
|
const contentType = new TextDecoder().decode(stdout).trim();
|
|
const bytes = await Deno.readFile(tmpPath);
|
|
// Decode using the charset declared in the Content-Type header so that
|
|
// pages served in ISO-8859-1, windows-1252, etc. are read correctly.
|
|
const charset = /charset=([\w-]+)/i.exec(contentType)?.[1] ?? "utf-8";
|
|
let bodyText: string;
|
|
try {
|
|
bodyText = new TextDecoder(charset, { fatal: false }).decode(bytes);
|
|
} catch {
|
|
bodyText = new TextDecoder("utf-8", { fatal: false }).decode(bytes);
|
|
}
|
|
return new Response(bodyText, { headers: { "content-type": "text/html" } });
|
|
} catch {
|
|
return null;
|
|
} finally {
|
|
await Deno.remove(tmpPath).catch(() => {});
|
|
}
|
|
}
|
|
|
|
export async function fetchWithTimeout(
|
|
url: string,
|
|
timeoutMs = 5000,
|
|
): Promise<Response> {
|
|
async function attempt(
|
|
extraInit?: Record<string, unknown>,
|
|
): Promise<Response> {
|
|
const controller = new AbortController();
|
|
const timer = setTimeout(() => controller.abort(), timeoutMs);
|
|
try {
|
|
return await fetch(url, {
|
|
signal: controller.signal,
|
|
headers: FETCH_HEADERS,
|
|
...extraInit,
|
|
} as RequestInit);
|
|
} finally {
|
|
clearTimeout(timer);
|
|
}
|
|
}
|
|
|
|
try {
|
|
return await attempt();
|
|
} catch (err) {
|
|
if (!(err instanceof TypeError && err.message.includes("certificate"))) {
|
|
throw err;
|
|
}
|
|
|
|
// Retry 1: allowInsecureCertificates handles expired / self-signed certs.
|
|
const client = Deno.createHttpClient({ allowInsecureCertificates: true });
|
|
try {
|
|
return await attempt({ client });
|
|
} catch {
|
|
/* UnsupportedSignatureAlgorithm etc. — rustls can't help */
|
|
} finally {
|
|
client.close();
|
|
}
|
|
|
|
// Retry 2: curl uses its own TLS stack and supports a wider set of
|
|
// certificate algorithms that Deno/rustls rejects.
|
|
const curlRes = await fetchViaCurl(url, timeoutMs);
|
|
if (curlRes) return curlRes;
|
|
|
|
throw err;
|
|
}
|
|
}
|
|
|
|
function decodeHtmlEntities(str: string): string {
|
|
return str
|
|
.replace(/&/gi, "&")
|
|
.replace(/</gi, "<")
|
|
.replace(/>/gi, ">")
|
|
.replace(/"/gi, '"')
|
|
.replace(/'/gi, "'")
|
|
.replace(/&#(\d+);/g, (_, dec) => String.fromCodePoint(Number(dec)))
|
|
.replace(
|
|
/&#x([0-9a-f]+);/gi,
|
|
(_, hex) => String.fromCodePoint(parseInt(hex, 16)),
|
|
);
|
|
}
|
|
|
|
export function extractOgTag(
|
|
html: string,
|
|
tag: string,
|
|
): string | undefined {
|
|
const patterns = [
|
|
new RegExp(
|
|
`<meta[^>]+property=["']og:${tag}["'][^>]+content=["']([^"']+)["']`,
|
|
"i",
|
|
),
|
|
new RegExp(
|
|
`<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:${tag}["']`,
|
|
"i",
|
|
),
|
|
];
|
|
for (const pattern of patterns) {
|
|
const match = html.match(pattern);
|
|
if (match) return decodeHtmlEntities(match[1]);
|
|
}
|
|
return undefined;
|
|
}
|
|
|
|
/** Extract content from `<meta name="…" content="…">` (both attribute orderings). */
|
|
export function extractMetaName(
|
|
html: string,
|
|
name: string,
|
|
): string | undefined {
|
|
const escaped = name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
const patterns = [
|
|
new RegExp(
|
|
`<meta[^>]+name=["']${escaped}["'][^>]+content=["']([^"']+)["']`,
|
|
"i",
|
|
),
|
|
new RegExp(
|
|
`<meta[^>]+content=["']([^"']+)["'][^>]+name=["']${escaped}["']`,
|
|
"i",
|
|
),
|
|
];
|
|
for (const pattern of patterns) {
|
|
const match = html.match(pattern);
|
|
if (match) return decodeHtmlEntities(match[1]);
|
|
}
|
|
return undefined;
|
|
}
|
|
|
|
/** Extract the text content of the `<title>` element. */
|
|
export function extractPageTitle(html: string): string | undefined {
|
|
const match = html.match(/<title[^>]*>([^<]+)<\/title>/i);
|
|
return match ? decodeHtmlEntities(match[1].trim()) : undefined;
|
|
}
|
|
|
|
// ── JSON-LD helpers (file-private) ────────────────────────────────────────────

/** Subset of page metadata extracted from JSON-LD blocks. All fields optional. */
type JsonLdResult = {
  title?: string;
  description?: string;
  thumbnailUrl?: string;
};
function ldString(v: unknown): string | undefined {
|
|
if (typeof v === "string" && v.trim()) return v.trim();
|
|
if (Array.isArray(v) && typeof v[0] === "string" && v[0].trim()) {
|
|
return v[0].trim();
|
|
}
|
|
return undefined;
|
|
}
|
|
|
|
function ldImage(v: unknown): string | undefined {
|
|
if (
|
|
typeof v === "string" &&
|
|
(v.startsWith("http://") || v.startsWith("https://"))
|
|
) return v;
|
|
if (Array.isArray(v)) return ldImage(v[0]);
|
|
if (v && typeof v === "object") {
|
|
const o = v as Record<string, unknown>;
|
|
return ldImage(o.url ?? o.contentUrl);
|
|
}
|
|
return undefined;
|
|
}
|
|
|
|
function ldExtractNode(data: unknown): JsonLdResult {
|
|
if (Array.isArray(data)) {
|
|
for (const item of data) {
|
|
const r = ldExtractNode(item);
|
|
if (r.title || r.thumbnailUrl) return r;
|
|
}
|
|
return {};
|
|
}
|
|
if (!data || typeof data !== "object") return {};
|
|
const o = data as Record<string, unknown>;
|
|
if (o["@graph"]) return ldExtractNode(o["@graph"]);
|
|
return {
|
|
title: ldString(o.name ?? o.headline),
|
|
description: ldString(o.description),
|
|
thumbnailUrl: ldImage(o.image ?? o.thumbnailUrl ?? o.thumbnail),
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Parse every `<script type="application/ld+json">` block and return the first
|
|
* node that yields a title or image. Handles `@graph`, arrays, and the common
|
|
* `image` shapes (string, string[], ImageObject).
|
|
*/
|
|
export function extractJsonLd(html: string): JsonLdResult {
|
|
const pattern =
|
|
/<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
|
|
let match: RegExpExecArray | null;
|
|
while ((match = pattern.exec(html)) !== null) {
|
|
try {
|
|
const result = ldExtractNode(JSON.parse(match[1]));
|
|
if (result.title || result.thumbnailUrl) return result;
|
|
} catch { /* invalid JSON — skip */ }
|
|
}
|
|
return {};
|
|
}
|
|
|
|
/**
|
|
* Return the `src` of the first `<img>` whose declared width or height is at
|
|
* least `minSize` pixels (default 200). Skips data URIs. Resolves relative URLs.
|
|
*/
|
|
export function extractLargeImage(
|
|
html: string,
|
|
baseUrl: string,
|
|
minSize = 200,
|
|
): string | undefined {
|
|
const imgPattern = /<img[^>]+>/gi;
|
|
let match: RegExpExecArray | null;
|
|
while ((match = imgPattern.exec(html)) !== null) {
|
|
const tag = match[0];
|
|
const src = /\bsrc=["']([^"']+)["']/i.exec(tag)?.[1];
|
|
if (!src || src.startsWith("data:")) continue;
|
|
const w = parseInt(/\bwidth=["']?(\d+)/i.exec(tag)?.[1] ?? "0");
|
|
const h = parseInt(/\bheight=["']?(\d+)/i.exec(tag)?.[1] ?? "0");
|
|
if (w >= minSize && h >= minSize) {
|
|
try {
|
|
return new URL(src, baseUrl).toString();
|
|
} catch {
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
return undefined;
|
|
}
|
|
|
|
/**
|
|
* Collect all `<link rel="icon">` / `<link rel="apple-touch-icon">` tags, rank
|
|
* them by declared size (largest wins), and return the best resolved URL.
|
|
* Falls back to the first match when no `sizes` attribute is present.
|
|
*/
|
|
export function extractBestIcon(
|
|
html: string,
|
|
baseUrl: string,
|
|
): string | undefined {
|
|
const linkRe = /<link[^>]+>/gi;
|
|
const relRe = /\brel=["']([^"']+)["']/i;
|
|
const hrefRe = /\bhref=["']([^"']+)["']/i;
|
|
const sizesRe = /\bsizes=["']([^"']+)["']/i;
|
|
|
|
const candidates: { href: string; area: number }[] = [];
|
|
|
|
let m: RegExpExecArray | null;
|
|
while ((m = linkRe.exec(html)) !== null) {
|
|
const tag = m[0];
|
|
const rel = relRe.exec(tag)?.[1] ?? "";
|
|
if (!/\bicon\b/i.test(rel) && !/apple-touch-icon/i.test(rel)) continue;
|
|
const href = hrefRe.exec(tag)?.[1];
|
|
if (!href) continue;
|
|
const sizesStr = sizesRe.exec(tag)?.[1] ?? "";
|
|
const sm = sizesStr.match(/(\d+)x(\d+)/i);
|
|
const area = sm ? parseInt(sm[1]) * parseInt(sm[2]) : 0;
|
|
try {
|
|
candidates.push({ href: new URL(href, baseUrl).toString(), area });
|
|
} catch {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if (candidates.length === 0) return undefined;
|
|
candidates.sort((a, b) => b.area - a.area);
|
|
return candidates[0].href;
|
|
}
|
|
|
|
/**
|
|
* Return the `src` of the first `<img>` that looks like content rather than UI
|
|
* chrome. Skips SVGs, data URIs, and images whose filename matches common
|
|
* icon/logo/nav patterns (logo, icon, sprite, favicon, avatar, banner, etc.).
|
|
* Resolves relative and protocol-relative URLs.
|
|
*/
|
|
const UI_IMAGE_KEYWORDS = new Set([
|
|
"logo",
|
|
"icon",
|
|
"sprite",
|
|
"favicon",
|
|
"avatar",
|
|
"banner",
|
|
"header",
|
|
"nav",
|
|
"menu",
|
|
"cart",
|
|
"search",
|
|
"tracking",
|
|
"pixel",
|
|
"bg",
|
|
"background",
|
|
]);
|
|
|
|
function isUiImage(src: string): boolean {
|
|
if (/\.svg(\?|$)/i.test(src)) return true;
|
|
const filename = src.split("?")[0].split("/").pop() ?? "";
|
|
const baseName = filename.replace(/\.[^.]+$/, ""); // strip extension
|
|
// Split on common filename separators (-, _, .) and check each token
|
|
return baseName.toLowerCase().split(/[-_.]/).some((t) =>
|
|
UI_IMAGE_KEYWORDS.has(t)
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Return the `src` of the first `<img>` that looks like content rather than UI
|
|
* chrome. Skips SVGs, data URIs, and images whose filename tokens match common
|
|
* icon/logo/nav patterns (logo, icon, sprite, etc.).
|
|
* Resolves relative and protocol-relative URLs.
|
|
*/
|
|
export function extractFirstContentImage(
|
|
html: string,
|
|
baseUrl: string,
|
|
): string | undefined {
|
|
const imgPattern = /<img[^>]+>/gi;
|
|
let match: RegExpExecArray | null;
|
|
while ((match = imgPattern.exec(html)) !== null) {
|
|
const tag = match[0];
|
|
const src = /\bsrc=["']([^"']+)["']/i.exec(tag)?.[1];
|
|
if (!src || src.startsWith("data:")) continue;
|
|
if (isUiImage(src)) continue;
|
|
try {
|
|
return new URL(src, baseUrl).toString();
|
|
} catch {
|
|
continue;
|
|
}
|
|
}
|
|
return undefined;
|
|
}
|
|
|
|
/**
|
|
* Extract `href` from the first `<link rel="…">` whose rel contains `relFragment`,
|
|
* resolved to an absolute URL using `baseUrl`.
|
|
*/
|
|
export function extractLinkHref(
|
|
html: string,
|
|
relFragment: string,
|
|
baseUrl: string,
|
|
): string | undefined {
|
|
const escaped = relFragment.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
|
const patterns = [
|
|
new RegExp(
|
|
`<link[^>]+rel=["'][^"']*${escaped}[^"']*["'][^>]+href=["']([^"']+)["']`,
|
|
"i",
|
|
),
|
|
new RegExp(
|
|
`<link[^>]+href=["']([^"']+)["'][^>]+rel=["'][^"']*${escaped}[^"']*["']`,
|
|
"i",
|
|
),
|
|
];
|
|
for (const pattern of patterns) {
|
|
const match = html.match(pattern);
|
|
if (match) {
|
|
try {
|
|
return new URL(match[1], baseUrl).toString();
|
|
} catch {
|
|
return undefined;
|
|
}
|
|
}
|
|
}
|
|
return undefined;
|
|
}
|
|
|
|
function isPrivateHost(hostname: string): boolean {
|
|
// Block loopback and RFC-1918 ranges. Note: DNS rebinding is not fully mitigated.
|
|
if (hostname === "localhost" || hostname === "::1") return true;
|
|
return /^(127\.|10\.|172\.(1[6-9]|2\d|3[01])\.|192\.168\.)/.test(hostname);
|
|
}
|
|
|
|
// Paths selfProvider resolves from the local DB: /dumps/:id, /users/:id, /playlists/:id.
const SELF_PATH_RE = /^\/(dumps|users|playlists)\/[^/]+$/;
export function isValidHttpUrl(raw: string): boolean {
|
|
try {
|
|
const u = new URL(raw);
|
|
if (u.protocol !== "http:" && u.protocol !== "https:") return false;
|
|
// Allow private hosts for self-referential gerbeur URLs — they are
|
|
// resolved directly from the DB by selfProvider, no outbound HTTP needed.
|
|
if (isPrivateHost(u.hostname) && !SELF_PATH_RE.test(u.pathname)) {
|
|
return false;
|
|
}
|
|
return true;
|
|
} catch {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
export async function fetchRichContent(
|
|
url: string,
|
|
): Promise<RichContent | undefined> {
|
|
try {
|
|
const provider = providers.find((p) => p.matches(url))!;
|
|
return await provider.fetch(url);
|
|
} catch (err) {
|
|
console.error(`[rich-content] Failed to fetch metadata for ${url}:`, err);
|
|
// Return a minimal stub so the caller always gets something displayable
|
|
// (e.g. when the site has a bad TLS cert or the fetch times out).
|
|
try {
|
|
return {
|
|
type: "generic",
|
|
url,
|
|
siteName: new URL(url).hostname.replace(/^www\./, ""),
|
|
};
|
|
} catch {
|
|
return undefined;
|
|
}
|
|
}
|
|
}
|