Files
gerbeur/api/services/rich-content-service.ts

313 lines
9.3 KiB
TypeScript

import type { RichContent } from "../model/interfaces.ts";
import { youtubeProvider } from "./providers/youtube.ts";
import { bandcampProvider } from "./providers/bandcamp.ts";
import { soundcloudProvider } from "./providers/soundcloud.ts";
import { selfProvider } from "./providers/self.ts";
import { genericProvider } from "./providers/generic.ts";
/**
 * Contract implemented by every rich-content source registered in this module.
 * `fetchRichContent` walks the registered providers in order and delegates to
 * the first one whose `matches` returns true.
 */
export interface RichContentProvider {
/** Provider identifier. Not read anywhere in this file — presumably used for logging/debugging; confirm against callers. */
name: string;
/** Returns true when this provider wants to handle `url`. */
matches(url: string): boolean;
/**
 * Resolves the metadata for `url`. A rejection is caught by `fetchRichContent`,
 * logged, and surfaced to the caller as `undefined`.
 */
fetch(url: string): Promise<RichContent>;
}
/**
 * Register providers in priority order. The first match wins:
 * `fetchRichContent` selects a provider with `Array.prototype.find`, so array
 * position is the only tie-breaker between providers that accept the same URL.
 * `selfProvider` resolves gerbeur URLs directly from the DB (no HTTP round-trip).
 * `genericProvider` must stay last — it always matches.
 */
const providers: RichContentProvider[] = [
youtubeProvider,
bandcampProvider,
soundcloudProvider,
selfProvider,
genericProvider,
];
// Shared utilities exported for use by providers
/**
 * Issue a GET request for `url` with browser-like request headers, aborting
 * it if `fetch` has not settled within `timeoutMs` milliseconds.
 *
 * The timer is cleared as soon as `fetch` settles (successfully or not), so it
 * does not bound any later reading of the response body.
 *
 * @param url Absolute URL to request.
 * @param timeoutMs Abort deadline in milliseconds (default 5000).
 * @returns The `Response` produced by `fetch`.
 * @throws Whatever `fetch` rejects with, including the abort error raised
 *   when the deadline fires.
 */
export async function fetchWithTimeout(
  url: string,
  timeoutMs = 5000,
): Promise<Response> {
  const abort = new AbortController();
  // Mimic a desktop browser: the headers below are sent verbatim.
  const browserHeaders = {
    "User-Agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Accept":
      "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
  };
  const deadline = setTimeout(() => abort.abort(), timeoutMs);
  try {
    return await fetch(url, { signal: abort.signal, headers: browserHeaders });
  } finally {
    // Always release the timer so it cannot fire after the request settled.
    clearTimeout(deadline);
  }
}
/** Replacement text for the five predefined named entities (lower-case keys). */
const NAMED_HTML_ENTITIES: Readonly<Record<string, string>> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
};

/**
 * Decode the HTML character references that commonly appear in meta-tag
 * attribute values: the five predefined named entities (matched
 * case-insensitively) plus decimal (`&#39;`) and hexadecimal (`&#x27;`)
 * numeric references. Any other named entity is left untouched.
 *
 * Decoding is done in a single pass so that decoded output is never scanned
 * again: `&amp;lt;` yields the literal text `&lt;`, not `<`.
 *
 * Numeric references that are not a Unicode scalar value (zero, surrogates,
 * anything above U+10FFFF) become U+FFFD rather than throwing a `RangeError`
 * from `String.fromCodePoint`, so one malformed entity cannot abort metadata
 * extraction for the whole page.
 *
 * @param str Raw attribute or text content.
 * @returns `str` with the supported character references decoded.
 */
function decodeHtmlEntities(str: string): string {
  return str.replace(
    /&(?:(amp|lt|gt|quot|apos)|#(\d+)|#x([0-9a-f]+));/gi,
    (whole: string, named?: string, dec?: string, hex?: string): string => {
      if (named !== undefined) {
        return NAMED_HTML_ENTITIES[named.toLowerCase()] ?? whole;
      }
      const codePoint = dec !== undefined
        ? Number(dec)
        : parseInt(hex ?? "", 16);
      // Very long digit runs overflow to Infinity; the range check rejects them too.
      const isScalarValue = codePoint > 0 && codePoint <= 0x10ffff &&
        (codePoint < 0xd800 || codePoint > 0xdfff);
      return isScalarValue ? String.fromCodePoint(codePoint) : "\uFFFD";
    },
  );
}
/**
 * Extract the `content` of the first `<meta property="og:TAG" content="…">`
 * element, accepting both attribute orderings and both quote styles.
 *
 * The value is delimited by the same quote character that opened it, so a
 * double-quoted value may contain apostrophes (`content="Don't stop"`) and a
 * single-quoted value may contain double quotes without being truncated.
 * `tag` is escaped before being embedded in the pattern, so it is always
 * matched literally.
 *
 * @param html Raw HTML document.
 * @param tag Open Graph property name without the `og:` prefix (e.g. `title`).
 * @returns The entity-decoded content, or `undefined` when no such tag with a
 *   non-empty content attribute is found.
 */
export function extractOgTag(
  html: string,
  tag: string,
): string | undefined {
  const escaped = tag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const propertyAttr = `property=["']og:${escaped}["']`;
  // Group 1 captures a double-quoted value, group 2 a single-quoted one.
  const contentAttr = `content=(?:"([^"]+)"|'([^']+)')`;
  const patterns = [
    new RegExp(`<meta[^>]+${propertyAttr}[^>]+${contentAttr}`, "i"),
    new RegExp(`<meta[^>]+${contentAttr}[^>]+${propertyAttr}`, "i"),
  ];
  for (const pattern of patterns) {
    const match = pattern.exec(html);
    const value = match?.[1] ?? match?.[2];
    if (value !== undefined) return decodeHtmlEntities(value);
  }
  return undefined;
}
/**
 * Extract content from `<meta name="…" content="…">` (both attribute
 * orderings, both quote styles).
 *
 * The value is delimited by the same quote character that opened it, so a
 * double-quoted description may contain apostrophes (`content="It's here"`)
 * and a single-quoted one may contain double quotes without being truncated.
 *
 * @param html Raw HTML document.
 * @param name Value of the `name` attribute to look for; matched literally
 *   and case-insensitively.
 * @returns The entity-decoded content, or `undefined` when no such tag with a
 *   non-empty content attribute is found.
 */
export function extractMetaName(
  html: string,
  name: string,
): string | undefined {
  const escaped = name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const nameAttr = `name=["']${escaped}["']`;
  // Group 1 captures a double-quoted value, group 2 a single-quoted one.
  const contentAttr = `content=(?:"([^"]+)"|'([^']+)')`;
  const patterns = [
    new RegExp(`<meta[^>]+${nameAttr}[^>]+${contentAttr}`, "i"),
    new RegExp(`<meta[^>]+${contentAttr}[^>]+${nameAttr}`, "i"),
  ];
  for (const pattern of patterns) {
    const match = pattern.exec(html);
    const value = match?.[1] ?? match?.[2];
    if (value !== undefined) return decodeHtmlEntities(value);
  }
  return undefined;
}
/**
 * Read the text of the first `<title>` element that contains no nested
 * markup, trim it, and decode its HTML entities.
 *
 * @returns The decoded title, or `undefined` when no such element exists.
 */
export function extractPageTitle(html: string): string | undefined {
  const titleMatch = /<title[^>]*>([^<]+)<\/title>/i.exec(html);
  if (titleMatch === null) return undefined;
  const rawTitle = titleMatch[1].trim();
  return decodeHtmlEntities(rawTitle);
}
// ── JSON-LD helpers (file-private) ────────────────────────────────────────────
/**
 * Subset of a JSON-LD node that the extractors in this file care about.
 * Every field is optional; `{}` means nothing usable was found.
 */
type JsonLdResult = {
/** Taken from the node's `name`, falling back to `headline`. */
title?: string;
/** Taken from the node's `description`. */
description?: string;
/** Absolute http(s) URL taken from `image`, `thumbnailUrl` or `thumbnail`. */
thumbnailUrl?: string;
};
/**
 * Coerce a JSON-LD value to a non-empty trimmed string.
 *
 * Accepts either a string or an array whose first element is a string; only
 * that first element is considered. Anything else, or a value that is blank
 * after trimming, yields `undefined`.
 */
function ldString(v: unknown): string | undefined {
  const candidate: unknown = Array.isArray(v) ? v[0] : v;
  if (typeof candidate !== "string") return undefined;
  const trimmed = candidate.trim();
  return trimmed ? trimmed : undefined;
}
/**
 * Resolve a JSON-LD image value to an absolute http(s) URL.
 *
 * Supported shapes: a URL string, an array (only its first element is
 * inspected), or an object exposing `url` or, when `url` is null/undefined,
 * `contentUrl`. Shapes may be nested. Strings that do not start with
 * `http://` or `https://` are rejected.
 */
function ldImage(v: unknown): string | undefined {
  if (typeof v === "string") {
    const isHttpUrl = ["http://", "https://"].some((scheme) =>
      v.startsWith(scheme)
    );
    return isHttpUrl ? v : undefined;
  }
  // Arrays must be handled before the generic object branch.
  if (Array.isArray(v)) {
    return ldImage(v[0]);
  }
  if (typeof v !== "object" || v === null) {
    return undefined;
  }
  const imageObject = v as Record<string, unknown>;
  return ldImage(imageObject.url ?? imageObject.contentUrl);
}
/**
 * Pull title / description / thumbnail out of a parsed JSON-LD value.
 *
 * - Arrays are searched depth-first; the first element that yields a title or
 *   a thumbnail wins.
 * - An object with a truthy `@graph` is replaced by the result for that graph;
 *   its own properties are then ignored.
 * - Any other object is read directly (`name`/`headline`, `description`,
 *   `image`/`thumbnailUrl`/`thumbnail`).
 * - Non-object values produce `{}`.
 */
function ldExtractNode(data: unknown): JsonLdResult {
  if (data === null || typeof data !== "object") return {};

  if (Array.isArray(data)) {
    for (const entry of data) {
      const extracted = ldExtractNode(entry);
      if (extracted.title || extracted.thumbnailUrl) return extracted;
    }
    return {};
  }

  const node = data as Record<string, unknown>;
  const graph = node["@graph"];
  if (graph) return ldExtractNode(graph);

  const titleSource = node.name ?? node.headline;
  const imageSource = node.image ?? node.thumbnailUrl ?? node.thumbnail;
  return {
    title: ldString(titleSource),
    description: ldString(node.description),
    thumbnailUrl: ldImage(imageSource),
  };
}
/**
 * Scan every `<script type="application/ld+json">` block in document order
 * and return the first extraction that carries a title or a thumbnail.
 * `@graph` containers, arrays and the usual `image` shapes (string, string[],
 * ImageObject) are handled by the node extractor.
 *
 * Blocks whose payload fails to parse (or whose extraction throws) are
 * skipped silently.
 *
 * @returns The first useful result, or `{}` when none of the blocks has one.
 */
export function extractJsonLd(html: string): JsonLdResult {
  const scriptBlocks = html.matchAll(
    /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi,
  );
  for (const [, payload] of scriptBlocks) {
    try {
      const extracted = ldExtractNode(JSON.parse(payload));
      if (extracted.title || extracted.thumbnailUrl) return extracted;
    } catch {
      // Invalid JSON — move on to the next block.
    }
  }
  return {};
}
/**
 * Return the resolved `src` of the first `<img>` whose declared `width` AND
 * `height` attributes are both at least `minSize` pixels (default 200).
 *
 * A missing dimension counts as 0, so images declaring only one of the two
 * never qualify. Images without a `src`, with a `data:` URI, or whose `src`
 * cannot be resolved against `baseUrl` are skipped.
 *
 * @param html Raw HTML document.
 * @param baseUrl URL used to resolve relative `src` values.
 * @param minSize Minimum declared width and height, in pixels.
 * @returns The absolute image URL, or `undefined` when no image qualifies.
 */
export function extractLargeImage(
  html: string,
  baseUrl: string,
  minSize = 200,
): string | undefined {
  // Reads a leading-integer attribute value; absent attributes read as 0.
  const declaredSize = (imgTag: string, attribute: RegExp): number =>
    parseInt(attribute.exec(imgTag)?.[1] ?? "0");

  for (const [imgTag] of html.matchAll(/<img[^>]+>/gi)) {
    const source = /\bsrc=["']([^"']+)["']/i.exec(imgTag)?.[1];
    if (!source || source.startsWith("data:")) continue;

    const width = declaredSize(imgTag, /\bwidth=["']?(\d+)/i);
    const height = declaredSize(imgTag, /\bheight=["']?(\d+)/i);
    const largeEnough = width >= minSize && height >= minSize;
    if (!largeEnough) continue;

    try {
      return new URL(source, baseUrl).toString();
    } catch {
      // Unresolvable src — keep looking.
    }
  }
  return undefined;
}
/**
 * Pick the best favicon-style link from the document.
 *
 * Every `<link>` whose `rel` contains the word `icon` (which includes
 * `apple-touch-icon`) and that has an `href` resolvable against `baseUrl` is a
 * candidate. Candidates are ranked by the pixel area of the first `WxH` entry
 * in their `sizes` attribute; a missing or non-numeric `sizes` counts as 0.
 * The largest area wins, and among equal areas the earliest tag wins.
 *
 * @param html Raw HTML document.
 * @param baseUrl URL used to resolve relative `href` values.
 * @returns The absolute URL of the winning icon, or `undefined` when the
 *   document declares none.
 */
export function extractBestIcon(
  html: string,
  baseUrl: string,
): string | undefined {
  // Returns the first capture group of `attribute` within `linkTag`, if any.
  const readAttr = (linkTag: string, attribute: RegExp): string | undefined =>
    attribute.exec(linkTag)?.[1];

  const ranked: { href: string; area: number }[] = [];
  for (const [linkTag] of html.matchAll(/<link[^>]+>/gi)) {
    const rel = readAttr(linkTag, /\brel=["']([^"']+)["']/i) ?? "";
    const isIcon = /\bicon\b/i.test(rel) || /apple-touch-icon/i.test(rel);
    if (!isIcon) continue;

    const rawHref = readAttr(linkTag, /\bhref=["']([^"']+)["']/i);
    if (!rawHref) continue;

    const sizes = readAttr(linkTag, /\bsizes=["']([^"']+)["']/i) ?? "";
    const dims = /(\d+)x(\d+)/i.exec(sizes);
    const area = dims ? parseInt(dims[1]) * parseInt(dims[2]) : 0;

    try {
      ranked.push({ href: new URL(rawHref, baseUrl).toString(), area });
    } catch {
      // Unresolvable href — not a usable candidate.
    }
  }

  if (ranked.length === 0) return undefined;
  // Stable descending sort keeps document order among equal areas.
  ranked.sort((left, right) => right.area - left.area);
  return ranked[0].href;
}
/**
 * Find the `href` of a `<link>` whose `rel` attribute contains `relFragment`
 * (literal, case-insensitive substring) and resolve it against `baseUrl`.
 *
 * The rel-before-href ordering is tried first, then href-before-rel. The
 * first ordering that matches decides the outcome: if its `href` cannot be
 * resolved, `undefined` is returned without trying the other ordering.
 *
 * @param html Raw HTML document.
 * @param relFragment Text that must appear inside the `rel` value.
 * @param baseUrl URL used to resolve a relative `href`.
 * @returns The absolute URL, or `undefined` when nothing matches or the
 *   matched `href` is not resolvable.
 */
export function extractLinkHref(
  html: string,
  relFragment: string,
  baseUrl: string,
): string | undefined {
  const fragment = relFragment.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const relAttr = `rel=["'][^"']*${fragment}[^"']*["']`;
  const hrefAttr = `href=["']([^"']+)["']`;
  const orderings = [
    `<link[^>]+${relAttr}[^>]+${hrefAttr}`,
    `<link[^>]+${hrefAttr}[^>]+${relAttr}`,
  ];

  for (const source of orderings) {
    const found = new RegExp(source, "i").exec(html);
    if (found === null) continue;
    try {
      return new URL(found[1], baseUrl).toString();
    } catch {
      return undefined;
    }
  }
  return undefined;
}
/**
 * IPv4 prefixes that must never be fetched: "this network" (0/8), loopback
 * (127/8), RFC 1918 private ranges and link-local (169.254/16, which is where
 * cloud metadata endpoints live).
 */
const PRIVATE_IPV4_RE =
  /^(0\.|127\.|10\.|172\.(1[6-9]|2\d|3[01])\.|192\.168\.|169\.254\.)/;

/**
 * Tell whether `hostname` names a loopback, private or link-local target.
 *
 * Accepts the value exactly as `URL.hostname` reports it, which means IPv6
 * literals arrive wrapped in brackets (`[::1]`); the bare form is accepted
 * too. Covered: `localhost` and `*.localhost`, the IPv4 prefixes listed on
 * `PRIVATE_IPV4_RE`, IPv6 loopback/unspecified (`::1`, `::`), unique-local
 * (`fc00::/7`), link-local (`fe80::/10`) and IPv4-mapped IPv6 addresses whose
 * embedded IPv4 address is itself private.
 *
 * Note: only the literal host string is inspected. A public name that
 * resolves to a private address is not detected, so DNS rebinding is not
 * fully mitigated.
 */
function isPrivateHost(hostname: string): boolean {
  const host = hostname
    .toLowerCase()
    .replace(/^\[(.*)\]$/, "$1") // unwrap a bracketed IPv6 literal
    .replace(/\.$/, ""); // ignore a trailing root dot ("localhost.")

  if (host === "localhost" || host.endsWith(".localhost")) return true;
  if (!host.includes(":")) return PRIVATE_IPV4_RE.test(host);

  // From here on the host is an IPv6 literal.
  if (host === "::1" || host === "::") return true;
  if (/^f[cd][0-9a-f]{2}:/.test(host)) return true; // fc00::/7
  if (/^fe[89ab][0-9a-f]:/.test(host)) return true; // fe80::/10

  // IPv4-mapped form; the URL parser serializes it as two hex groups.
  const mappedHex = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(host);
  if (mappedHex) {
    const high = parseInt(mappedHex[1], 16);
    const low = parseInt(mappedHex[2], 16);
    const dotted = `${high >> 8}.${high & 0xff}.${low >> 8}.${low & 0xff}`;
    return PRIVATE_IPV4_RE.test(dotted);
  }
  const mappedDotted = /^::ffff:(\d+\.\d+\.\d+\.\d+)$/.exec(host);
  if (mappedDotted) return PRIVATE_IPV4_RE.test(mappedDotted[1]);

  return false;
}
/** Paths of the form `/dumps/<id>`, `/users/<id>` or `/playlists/<id>` (exactly two segments). */
const SELF_PATH_RE = /^\/(dumps|users|playlists)\/[^/]+$/;

/**
 * Validate a user-supplied URL before it is handed to a provider.
 *
 * Accepts only parseable `http:` / `https:` URLs. URLs that point at a
 * private host are rejected unless their path looks like a gerbeur resource
 * (`SELF_PATH_RE`); those are meant to be resolved from the DB by
 * `selfProvider`, with no outbound HTTP request.
 *
 * NOTE(review): the exemption is decided on the path alone. It is only safe if
 * `selfProvider.matches` claims every URL that passes through it; otherwise a
 * later provider could fetch a private address. Verify against `selfProvider`.
 *
 * @param raw Candidate URL string.
 * @returns `true` when the URL may be processed, `false` otherwise (including
 *   when it cannot be parsed).
 */
export function isValidHttpUrl(raw: string): boolean {
  try {
    const parsed = new URL(raw);
    const isHttp = parsed.protocol === "http:" ||
      parsed.protocol === "https:";
    if (!isHttp) return false;

    const blockedPrivateTarget = isPrivateHost(parsed.hostname) &&
      !SELF_PATH_RE.test(parsed.pathname);
    return !blockedPrivateTarget;
  } catch {
    return false;
  }
}
/**
 * Resolve rich metadata for `url` through the first registered provider that
 * accepts it.
 *
 * This function never rejects: any error raised while selecting the provider
 * or while fetching is logged with `console.error` and reported as
 * `undefined`.
 *
 * @param url URL to describe.
 * @returns The provider's result, or `undefined` on failure.
 */
export async function fetchRichContent(
  url: string,
): Promise<RichContent | undefined> {
  let content: RichContent | undefined;
  try {
    // The registry ends with a catch-all provider, hence the non-null assertion.
    const chosen = providers.find((candidate) => candidate.matches(url))!;
    content = await chosen.fetch(url);
  } catch (err) {
    console.error(`[rich-content] Failed to fetch metadata for ${url}:`, err);
  }
  return content;
}