import type { RichContent } from "../model/interfaces.ts";
import { youtubeProvider } from "./providers/youtube.ts";
import { bandcampProvider } from "./providers/bandcamp.ts";
import { soundcloudProvider } from "./providers/soundcloud.ts";
import { selfProvider } from "./providers/self.ts";
import { genericProvider } from "./providers/generic.ts";

/** Contract implemented by every entry of the provider registry below. */
export interface RichContentProvider {
  /** Provider identifier (not read anywhere in this file). */
  name: string;
  /** Returns `true` when this provider wants to handle `url`. */
  matches(url: string): boolean;
  /**
   * Resolves the rich content for `url`.
   * NOTE(review): the resolved type mirrors `fetchRichContent`'s return type;
   * confirm against the provider modules.
   */
  fetch(url: string): Promise<RichContent | undefined>;
}

/**
 * Register providers in priority order. The first match wins.
 * `selfProvider` resolves gerbeur URLs directly from the DB (no HTTP round-trip).
 * `genericProvider` must stay last — it always matches.
 */
const providers: RichContentProvider[] = [
  youtubeProvider,
  bandcampProvider,
  soundcloudProvider,
  selfProvider,
  genericProvider,
];

// ── Regex building blocks (file-private) ─────────────────────────────────────

/**
 * Regex source matching a non-empty quoted attribute value. It contributes two
 * capture groups (double-quoted, single-quoted) of which exactly one is set,
 * so a value such as `content="L'album"` is captured whole instead of being
 * cut at the apostrophe.
 */
const QUOTED_VALUE = `(?:"([^"]+)"|'([^']+)')`;

/** Builds a case-insensitive pattern capturing the quoted value of attribute `name`. */
const quotedAttr = (name: string): RegExp =>
  new RegExp(`\\b${name}=${QUOTED_VALUE}`, "i");

const SRC_ATTR_RE = quotedAttr("src");
const REL_ATTR_RE = quotedAttr("rel");
const HREF_ATTR_RE = quotedAttr("href");
const SIZES_ATTR_RE = quotedAttr("sizes");
const WIDTH_ATTR_RE = /\bwidth=["']?(\d+)/i;
const HEIGHT_ATTR_RE = /\bheight=["']?(\d+)/i;

/** Escapes every regex metacharacter so `literal` matches itself verbatim. */
function escapeRegExp(literal: string): string {
  return literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/** Returns whichever of the two `QUOTED_VALUE` groups participated in `match`. */
function captured(match: RegExpExecArray | null): string | undefined {
  return match?.[1] ?? match?.[2];
}

/** Runs `patterns` in order and returns the captured value of the first hit. */
function firstCapture(
  html: string,
  patterns: readonly RegExp[],
): string | undefined {
  for (const pattern of patterns) {
    const value = captured(pattern.exec(html));
    if (value !== undefined) return value;
  }
  return undefined;
}

// Shared utilities exported for use by providers

/**
 * `fetch` with browser-like request headers and an abort timer.
 *
 * The request is aborted when `fetch` has not resolved within `timeoutMs`.
 * The timer is cleared as soon as `fetch` settles, so reading the response
 * body afterwards is not covered by the timeout.
 *
 * @param url Address to request.
 * @param timeoutMs Milliseconds to wait before aborting (default 5000).
 * @returns The `Response` produced by `fetch`.
 * @throws Whatever `fetch` rejects with, including the abort error on timeout.
 */
export async function fetchWithTimeout(
  url: string,
  timeoutMs = 5000,
): Promise<Response> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    return await fetch(url, {
      signal: controller.signal,
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
        "Accept":
          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
      },
    });
  } finally {
    clearTimeout(timer);
  }
}

/** Named character references understood by `decodeHtmlEntities`. */
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  ["nbsp", "\u00a0"],
]);

/**
 * Decodes the named references listed in `NAMED_ENTITIES` plus decimal and
 * hexadecimal numeric references.
 *
 * Everything is decoded in a single pass, so `&amp;lt;` yields `&lt;` rather
 * than `<`. Unknown names and code points above U+10FFFF are left untouched
 * (`String.fromCodePoint` would throw a `RangeError` on the latter).
 */
function decodeHtmlEntities(str: string): string {
  return str.replace(
    /&(?:#x([0-9a-f]+)|#(\d+)|([a-z]+));/gi,
    (entity: string, hex?: string, dec?: string, named?: string) => {
      if (named !== undefined) {
        return NAMED_ENTITIES.get(named.toLowerCase()) ?? entity;
      }
      const codePoint = hex !== undefined ? parseInt(hex, 16) : Number(dec);
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    },
  );
}

/**
 * Looks up `<meta {attr}="{value}" content="…">` in either attribute order
 * and returns the entity-decoded content.
 */
function extractMetaContent(
  html: string,
  attr: "property" | "name",
  value: string,
): string | undefined {
  const key = escapeRegExp(value);
  const raw = firstCapture(html, [
    new RegExp(
      `<meta[^>]+${attr}=["']${key}["'][^>]+content=${QUOTED_VALUE}`,
      "i",
    ),
    new RegExp(
      `<meta[^>]+content=${QUOTED_VALUE}[^>]+${attr}=["']${key}["']`,
      "i",
    ),
  ]);
  return raw === undefined ? undefined : decodeHtmlEntities(raw);
}

/**
 * Extract content from `<meta property="og:{tag}" content="…">` (both
 * attribute orderings).
 *
 * @param html Raw HTML document.
 * @param tag Open Graph property name without the `og:` prefix; matched literally.
 * @returns The decoded content, or `undefined` when no such tag is found.
 */
export function extractOgTag(
  html: string,
  tag: string,
): string | undefined {
  return extractMetaContent(html, "property", `og:${tag}`);
}

/** Extract content from `<meta name="…" content="…">` (both attribute orderings). */
export function extractMetaName(
  html: string,
  name: string,
): string | undefined {
  return extractMetaContent(html, "name", name);
}

/**
 * Extract the text content of the `<title>` element.
 *
 * @returns The decoded, trimmed title, or `undefined` when the element is
 *   missing or contains nothing but whitespace.
 */
export function extractPageTitle(html: string): string | undefined {
  const match = /<title[^>]*>([^<]+)<\/title>/i.exec(html);
  if (!match) return undefined;
  const title = decodeHtmlEntities(match[1]).trim();
  return title || undefined;
}

// ── JSON-LD helpers (file-private) ────────────────────────────────────────────

type JsonLdResult = {
  title?: string;
  description?: string;
  thumbnailUrl?: string;
};

/** Returns `v` (or its first element when `v` is an array) as a trimmed, non-empty string. */
function ldString(v: unknown): string | undefined {
  const candidate: unknown = Array.isArray(v) ? v[0] : v;
  if (typeof candidate !== "string") return undefined;
  const trimmed = candidate.trim();
  return trimmed || undefined;
}

/**
 * Resolves a JSON-LD image value to an absolute http(s) URL. Accepts a plain
 * string, an array (first element only), or an object carrying `url` or
 * `contentUrl`.
 */
function ldImage(v: unknown): string | undefined {
  if (typeof v === "string") {
    return v.startsWith("http://") || v.startsWith("https://") ? v : undefined;
  }
  if (Array.isArray(v)) return ldImage(v[0]);
  if (v && typeof v === "object") {
    const o = v as Record<string, unknown>;
    return ldImage(o.url ?? o.contentUrl);
  }
  return undefined;
}

/**
 * Walks a parsed JSON-LD value. Arrays yield the first element that provides
 * a title or thumbnail; an object with `@graph` delegates to the graph;
 * any other object is read directly.
 */
function ldExtractNode(data: unknown): JsonLdResult {
  if (Array.isArray(data)) {
    for (const item of data) {
      const r = ldExtractNode(item);
      if (r.title || r.thumbnailUrl) return r;
    }
    return {};
  }
  if (!data || typeof data !== "object") return {};
  const o = data as Record<string, unknown>;
  if (o["@graph"]) return ldExtractNode(o["@graph"]);
  return {
    title: ldString(o.name ?? o.headline),
    description: ldString(o.description),
    thumbnailUrl: ldImage(o.image ?? o.thumbnailUrl ?? o.thumbnail),
  };
}

/**
 * Parse every `<script type="application/ld+json">` block and return the first
 * node that yields a title or image. Handles `@graph`, arrays, and the common
 * `image` shapes (string, string[], ImageObject).
 */
export function extractJsonLd(html: string): JsonLdResult {
  const pattern =
    /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
  for (const [, json] of html.matchAll(pattern)) {
    try {
      const result = ldExtractNode(JSON.parse(json));
      if (result.title || result.thumbnailUrl) return result;
    } catch { /* invalid JSON — skip */ }
  }
  return {};
}

/**
 * Return the `src` of the first `<img>` whose declared width and height are
 * both at least `minSize` pixels (default 200). A missing dimension counts as
 * 0. Skips data URIs. Resolves relative URLs.
 */
export function extractLargeImage(
  html: string,
  baseUrl: string,
  minSize = 200,
): string | undefined {
  for (const [tag] of html.matchAll(/<img[^>]+>/gi)) {
    const src = captured(SRC_ATTR_RE.exec(tag));
    if (!src || src.startsWith("data:")) continue;
    const width = parseInt(WIDTH_ATTR_RE.exec(tag)?.[1] ?? "0", 10);
    const height = parseInt(HEIGHT_ATTR_RE.exec(tag)?.[1] ?? "0", 10);
    if (width < minSize || height < minSize) continue;
    try {
      return new URL(src, baseUrl).toString();
    } catch { /* unresolvable URL — try the next image */ }
  }
  return undefined;
}

/**
 * Collect all `<link rel="icon">` / `<link rel="apple-touch-icon">` tags, rank
 * them by declared size (largest wins), and return the best resolved URL.
 * Falls back to the first match when no `sizes` attribute is present.
 */
export function extractBestIcon(
  html: string,
  baseUrl: string,
): string | undefined {
  let best: { href: string; area: number } | undefined;
  for (const [tag] of html.matchAll(/<link[^>]+>/gi)) {
    const rel = captured(REL_ATTR_RE.exec(tag)) ?? "";
    if (!/\bicon\b/i.test(rel) && !/apple-touch-icon/i.test(rel)) continue;
    const href = captured(HREF_ATTR_RE.exec(tag));
    if (!href) continue;
    const sizes = captured(SIZES_ATTR_RE.exec(tag)) ?? "";
    const dims = /(\d+)x(\d+)/i.exec(sizes);
    const area = dims ? parseInt(dims[1], 10) * parseInt(dims[2], 10) : 0;
    let resolved: string;
    try {
      resolved = new URL(href, baseUrl).toString();
    } catch {
      continue;
    }
    // Strictly greater: on equal areas the earliest candidate is kept.
    if (!best || area > best.area) best = { href: resolved, area };
  }
  return best?.href;
}

/**
 * Extract `href` from the first `<link rel="…">` whose rel contains `relFragment`,
 * resolved to an absolute URL using `baseUrl`.
 *
 * @returns The absolute URL, or `undefined` when no link matches or the
 *   matched `href` cannot be resolved against `baseUrl`.
 */
export function extractLinkHref(
  html: string,
  relFragment: string,
  baseUrl: string,
): string | undefined {
  const escaped = escapeRegExp(relFragment);
  const href = firstCapture(html, [
    new RegExp(
      `<link[^>]+rel=["'][^"']*${escaped}[^"']*["'][^>]+href=${QUOTED_VALUE}`,
      "i",
    ),
    new RegExp(
      `<link[^>]+href=${QUOTED_VALUE}[^>]+rel=["'][^"']*${escaped}[^"']*["']`,
      "i",
    ),
  ]);
  if (href === undefined) return undefined;
  try {
    return new URL(href, baseUrl).toString();
  } catch {
    return undefined;
  }
}

/**
 * IPv4 prefixes that must never be fetched: "this network" (0/8), RFC-1918
 * (10/8, 172.16/12, 192.168/16), loopback (127/8) and link-local (169.254/16).
 */
const PRIVATE_IPV4_RE =
  /^(?:0|10|127|169\.254|172\.(?:1[6-9]|2\d|3[01])|192\.168)\./;

/**
 * Reports whether `hostname` (as produced by `URL.hostname`) designates a
 * loopback, private or link-local host.
 *
 * Note: DNS rebinding is not fully mitigated — a public name may still
 * resolve to a private address.
 */
function isPrivateHost(hostname: string): boolean {
  // `URL.hostname` wraps IPv6 literals in brackets and keeps a trailing root dot.
  const host = hostname
    .toLowerCase()
    .replace(/^\[|\]$/g, "")
    .replace(/\.$/, "");
  if (host === "localhost" || host.endsWith(".localhost")) return true;

  if (host.includes(":")) {
    if (host === "::1" || host === "::") return true; // loopback / unspecified
    if (/^f[cd][0-9a-f]{2}:/.test(host)) return true; // unique local fc00::/7
    if (/^fe[89ab][0-9a-f]:/.test(host)) return true; // link-local fe80::/10
    // IPv4-mapped addresses: URL serialises them as `::ffff:hhhh:hhhh`.
    const mappedHex = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(host);
    if (mappedHex) {
      const hi = parseInt(mappedHex[1], 16);
      const lo = parseInt(mappedHex[2], 16);
      return PRIVATE_IPV4_RE.test(
        `${hi >> 8}.${hi & 0xff}.${lo >> 8}.${lo & 0xff}`,
      );
    }
    const mappedDotted = /^::ffff:(\d+\.\d+\.\d+\.\d+)$/.exec(host);
    if (mappedDotted) return PRIVATE_IPV4_RE.test(mappedDotted[1]);
    return false;
  }

  return PRIVATE_IPV4_RE.test(host);
}

/** Paths treated as self-referential gerbeur resources. */
const SELF_PATH_RE = /^\/(dumps|users|playlists)\/[^/]+$/;

/**
 * Validates a user-supplied URL before it is handed to a provider.
 *
 * @returns `true` for an http(s) URL whose host is not private, or whose path
 *   matches `SELF_PATH_RE`; `false` for anything else, including unparsable input.
 */
export function isValidHttpUrl(raw: string): boolean {
  try {
    const u = new URL(raw);
    if (u.protocol !== "http:" && u.protocol !== "https:") return false;
    // Allow private hosts for self-referential gerbeur URLs — they are
    // resolved directly from the DB by selfProvider, no outbound HTTP needed.
    // NOTE(review): this exemption is keyed on the path alone; confirm that
    // selfProvider.matches claims every such URL so none falls through to a
    // provider that performs an outbound request.
    if (isPrivateHost(u.hostname) && !SELF_PATH_RE.test(u.pathname)) {
      return false;
    }
    return true;
  } catch {
    return false;
  }
}

/**
 * Resolves rich content for `url` through the first matching provider.
 *
 * Never rejects: any error thrown by a provider is logged and turned into
 * `undefined`.
 */
export async function fetchRichContent(
  url: string,
): Promise<RichContent | undefined> {
  try {
    const provider = providers.find((p) => p.matches(url));
    // genericProvider is expected to match everything; guard regardless.
    if (!provider) return undefined;
    return await provider.fetch(url);
  } catch (err) {
    console.error(`[rich-content] Failed to fetch metadata for ${url}:`, err);
    return undefined;
  }
}