Files
gerbeur/api/services/rich-content-service.ts
khannurien 34933a3d4f
All checks were successful
Build and Publish Docker Image / build-and-push (push) Successful in 3m15s
v3: fixed rich content extraction heuristics
2026-04-11 13:13:43 +00:00

471 lines
14 KiB
TypeScript

import type { RichContent } from "../model/interfaces.ts";
import { youtubeProvider } from "./providers/youtube.ts";
import { bandcampProvider } from "./providers/bandcamp.ts";
import { soundcloudProvider } from "./providers/soundcloud.ts";
import { selfProvider } from "./providers/self.ts";
import { genericProvider } from "./providers/generic.ts";
/**
 * Contract implemented by every rich-content provider.
 * `matches` decides, synchronously and from the URL alone, whether the
 * provider handles a given link; `fetch` then resolves the link to a
 * `RichContent` payload. Providers are consulted in registration order
 * (see `providers` below) and the first match wins.
 */
export interface RichContentProvider {
  // Human-readable identifier (useful for logging/debugging).
  name: string;
  // True when this provider knows how to handle `url`.
  matches(url: string): boolean;
  // Resolve `url` to displayable metadata; may reject (network, parsing, …).
  fetch(url: string): Promise<RichContent>;
}
/**
 * Register providers in priority order. The first match wins.
 * `selfProvider` resolves gerbeur URLs directly from the DB (no HTTP round-trip).
 * `genericProvider` must stay last — it always matches, so any provider
 * registered after it would be unreachable.
 */
const providers: RichContentProvider[] = [
  youtubeProvider,
  bandcampProvider,
  soundcloudProvider,
  selfProvider,
  genericProvider, // catch-all — keep last
];
// Shared utilities exported for use by providers.
// Browser-like request headers: some sites refuse or degrade responses for
// non-browser user agents, so outgoing requests impersonate a desktop
// Chrome build. Reused by both fetch() and the curl fallback below.
const FETCH_HEADERS = {
  "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
  "Accept":
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
};
/**
 * Fetch `url` via a `curl --insecure` subprocess and return a minimal Response.
 * Used as a last resort when Deno's rustls rejects the server's TLS certificate
 * due to an unsupported algorithm (e.g. UnsupportedSignatureAlgorithm).
 * Returns null if curl is unavailable or exits non-zero.
 *
 * The synthetic Response always advertises `content-type: text/html`: the body
 * has already been transcoded to a JS string here, and the real charset (from
 * curl's `%{content_type}` write-out) is only needed for that decoding step.
 * NOTE(review): non-HTML MIME types are therefore masked — presumably fine
 * because this path only serves HTML metadata scraping; confirm with callers.
 */
async function fetchViaCurl(
  url: string,
  timeoutMs: number,
): Promise<Response | null> {
  // The body is streamed to a temp file so stdout can carry only the
  // write-out value (the effective content type after redirects).
  const tmpPath = await Deno.makeTempFile();
  try {
    const { code, stdout } = await new Deno.Command("curl", {
      args: [
        "--silent",
        "--insecure", // accept any certificate — that is the whole point here
        "--location", // follow redirects, like fetch() does by default
        "--max-time",
        String(Math.ceil(timeoutMs / 1000)), // curl takes whole seconds
        "--user-agent",
        FETCH_HEADERS["User-Agent"],
        "--header",
        `Accept: ${FETCH_HEADERS["Accept"]}`,
        "--header",
        `Accept-Language: ${FETCH_HEADERS["Accept-Language"]}`,
        "--output",
        tmpPath,
        "--write-out",
        "%{content_type}",
        url,
      ],
      stdout: "piped",
      stderr: "null",
    }).output();
    if (code !== 0) return null;
    const contentType = new TextDecoder().decode(stdout).trim();
    const bytes = await Deno.readFile(tmpPath);
    // Decode using the charset declared in the Content-Type header so that
    // pages served in ISO-8859-1, windows-1252, etc. are read correctly.
    const charset = /charset=([\w-]+)/i.exec(contentType)?.[1] ?? "utf-8";
    let bodyText: string;
    try {
      bodyText = new TextDecoder(charset, { fatal: false }).decode(bytes);
    } catch {
      // Unknown/invalid charset label — the TextDecoder constructor throws
      // a RangeError; fall back to UTF-8.
      bodyText = new TextDecoder("utf-8", { fatal: false }).decode(bytes);
    }
    return new Response(bodyText, { headers: { "content-type": "text/html" } });
  } catch {
    // curl missing, subprocess spawn failure, temp-file I/O error, … —
    // all treated uniformly as "could not fetch via curl".
    return null;
  } finally {
    // Best-effort cleanup; never let a failed remove mask the result.
    await Deno.remove(tmpPath).catch(() => {});
  }
}
/**
 * Fetch `url` with a hard timeout, falling back through progressively more
 * permissive TLS paths:
 *   1. plain `fetch`;
 *   2. `fetch` with an insecure Deno HTTP client (expired/self-signed certs);
 *   3. a `curl --insecure` subprocess (algorithms rustls rejects outright).
 * Non-certificate errors — and exhaustion of every fallback — rethrow the
 * original error.
 */
export async function fetchWithTimeout(
  url: string,
  timeoutMs = 5000,
): Promise<Response> {
  const doFetch = async (
    extraInit?: Record<string, unknown>,
  ): Promise<Response> => {
    const abort = new AbortController();
    const timer = setTimeout(() => abort.abort(), timeoutMs);
    try {
      return await fetch(url, {
        signal: abort.signal,
        headers: FETCH_HEADERS,
        ...extraInit,
      } as RequestInit);
    } finally {
      clearTimeout(timer);
    }
  };
  try {
    return await doFetch();
  } catch (err) {
    const isCertError = err instanceof TypeError &&
      err.message.includes("certificate");
    if (!isCertError) throw err;
    // Retry 1: allowInsecureCertificates handles expired / self-signed certs.
    const insecureClient = Deno.createHttpClient({
      allowInsecureCertificates: true,
    });
    try {
      return await doFetch({ client: insecureClient });
    } catch {
      /* UnsupportedSignatureAlgorithm etc. — rustls can't help */
    } finally {
      insecureClient.close();
    }
    // Retry 2: curl uses its own TLS stack and supports a wider set of
    // certificate algorithms that Deno/rustls rejects.
    const viaCurl = await fetchViaCurl(url, timeoutMs);
    if (viaCurl) return viaCurl;
    throw err;
  }
}
/**
 * Decode the HTML entities that commonly appear in meta-tag content:
 * the five XML named entities plus decimal/hex numeric references.
 *
 * `&amp;` is decoded LAST so that double-escaped input such as `&amp;lt;`
 * yields the literal text `&lt;` instead of being decoded twice to `<`.
 * (The previous version decoded `&amp;` first, which double-decoded
 * such input.)
 */
function decodeHtmlEntities(str: string): string {
  return str
    .replace(/&lt;/gi, "<")
    .replace(/&gt;/gi, ">")
    .replace(/&quot;/gi, '"')
    .replace(/&apos;/gi, "'")
    .replace(/&#(\d+);/g, (_, dec) => String.fromCodePoint(Number(dec)))
    .replace(
      /&#x([0-9a-f]+);/gi,
      (_, hex) => String.fromCodePoint(parseInt(hex, 16)),
    )
    .replace(/&amp;/gi, "&"); // ampersand last — see doc comment
}
/**
 * Extract the content of an Open Graph meta tag (`og:<tag>`), handling both
 * attribute orderings (`property` before `content` and vice versa).
 * Returns the entity-decoded value, or undefined when the tag is absent.
 */
export function extractOgTag(
  html: string,
  tag: string,
): string | undefined {
  const orderings = [
    `<meta[^>]+property=["']og:${tag}["'][^>]+content=["']([^"']+)["']`,
    `<meta[^>]+content=["']([^"']+)["'][^>]+property=["']og:${tag}["']`,
  ];
  for (const source of orderings) {
    const hit = new RegExp(source, "i").exec(html);
    if (hit) return decodeHtmlEntities(hit[1]);
  }
  return undefined;
}
/** Extract content from `<meta name="…" content="…">` (both attribute orderings). */
export function extractMetaName(
  html: string,
  name: string,
): string | undefined {
  // Escape regex metacharacters so arbitrary names are matched literally.
  const safeName = name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const orderings = [
    `<meta[^>]+name=["']${safeName}["'][^>]+content=["']([^"']+)["']`,
    `<meta[^>]+content=["']([^"']+)["'][^>]+name=["']${safeName}["']`,
  ];
  for (const source of orderings) {
    const hit = new RegExp(source, "i").exec(html);
    if (hit) return decodeHtmlEntities(hit[1]);
  }
  return undefined;
}
/** Extract the text content of the `<title>` element, entity-decoded and trimmed. */
export function extractPageTitle(html: string): string | undefined {
  const hit = /<title[^>]*>([^<]+)<\/title>/i.exec(html);
  if (!hit) return undefined;
  return decodeHtmlEntities(hit[1].trim());
}
// ── JSON-LD helpers (file-private) ────────────────────────────────────────────
// Subset of page metadata recoverable from a JSON-LD node; every field is
// optional because a node may carry any combination of them.
type JsonLdResult = {
  title?: string;
  description?: string;
  thumbnailUrl?: string;
};
/**
 * Coerce a JSON-LD text value to a trimmed, non-empty string.
 * Accepts either a plain string or an array whose first element is a string
 * (a common JSON-LD shape); anything else yields undefined.
 */
function ldString(v: unknown): string | undefined {
  const candidate = Array.isArray(v) ? v[0] : v;
  if (typeof candidate === "string") {
    const trimmed = candidate.trim();
    if (trimmed) return trimmed;
  }
  return undefined;
}
/**
 * Pull an absolute image URL out of the common JSON-LD `image` shapes:
 * a plain string, an array (first element wins), or an ImageObject with
 * `url`/`contentUrl`. Values that are not absolute http(s) URLs yield
 * undefined.
 */
function ldImage(v: unknown): string | undefined {
  if (typeof v === "string") {
    const isHttp = v.startsWith("http://") || v.startsWith("https://");
    return isHttp ? v : undefined;
  }
  if (Array.isArray(v)) return ldImage(v[0]);
  if (v !== null && v !== undefined && typeof v === "object") {
    const node = v as Record<string, unknown>;
    return ldImage(node.url ?? node.contentUrl);
  }
  return undefined;
}
/**
 * Recursively extract title/description/thumbnail from a parsed JSON-LD
 * value. Arrays are scanned for the first node that yields a title or an
 * image; `@graph` wrappers are unwrapped; non-object values yield {}.
 */
function ldExtractNode(data: unknown): JsonLdResult {
  if (Array.isArray(data)) {
    for (const entry of data) {
      const extracted = ldExtractNode(entry);
      if (extracted.title || extracted.thumbnailUrl) return extracted;
    }
    return {};
  }
  if (data === null || typeof data !== "object") return {};
  const node = data as Record<string, unknown>;
  if (node["@graph"]) return ldExtractNode(node["@graph"]);
  return {
    title: ldString(node.name ?? node.headline),
    description: ldString(node.description),
    thumbnailUrl: ldImage(node.image ?? node.thumbnailUrl ?? node.thumbnail),
  };
}
/**
 * Parse every `<script type="application/ld+json">` block and return the first
 * node that yields a title or image. Handles `@graph`, arrays, and the common
 * `image` shapes (string, string[], ImageObject). Malformed JSON blocks are
 * skipped; an empty result object is returned when nothing matches.
 */
export function extractJsonLd(html: string): JsonLdResult {
  const scriptRe =
    /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
  for (let m = scriptRe.exec(html); m !== null; m = scriptRe.exec(html)) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(m[1]);
    } catch {
      continue; // invalid JSON — skip this block
    }
    const extracted = ldExtractNode(parsed);
    if (extracted.title || extracted.thumbnailUrl) return extracted;
  }
  return {};
}
/**
 * Return the `src` of the first `<img>` whose declared width AND height are
 * both at least `minSize` pixels (default 200) — images missing either
 * dimension are skipped. Skips data URIs; resolves relative URLs against
 * `baseUrl`.
 */
export function extractLargeImage(
  html: string,
  baseUrl: string,
  minSize = 200,
): string | undefined {
  const tagRe = /<img[^>]+>/gi;
  for (let m = tagRe.exec(html); m !== null; m = tagRe.exec(html)) {
    const imgTag = m[0];
    const src = /\bsrc=["']([^"']+)["']/i.exec(imgTag)?.[1];
    if (!src || src.startsWith("data:")) continue;
    const width = parseInt(/\bwidth=["']?(\d+)/i.exec(imgTag)?.[1] ?? "0");
    const height = parseInt(/\bheight=["']?(\d+)/i.exec(imgTag)?.[1] ?? "0");
    if (width < minSize || height < minSize) continue;
    try {
      return new URL(src, baseUrl).toString();
    } catch {
      // Unresolvable src — keep scanning subsequent images.
    }
  }
  return undefined;
}
/**
 * Scan all `<link rel="icon">` / `<link rel="apple-touch-icon">` tags, rank
 * them by declared size (largest area wins), and return the best resolved URL.
 * Icons without a parseable `sizes` attribute rank lowest (area 0); when no
 * candidate declares a size, the first one in document order wins.
 */
export function extractBestIcon(
  html: string,
  baseUrl: string,
): string | undefined {
  const tagRe = /<link[^>]+>/gi;
  let best: { href: string; area: number } | undefined;
  for (let m = tagRe.exec(html); m !== null; m = tagRe.exec(html)) {
    const linkTag = m[0];
    const rel = /\brel=["']([^"']+)["']/i.exec(linkTag)?.[1] ?? "";
    if (!/\bicon\b/i.test(rel) && !/apple-touch-icon/i.test(rel)) continue;
    const href = /\bhref=["']([^"']+)["']/i.exec(linkTag)?.[1];
    if (!href) continue;
    const sizes = /\bsizes=["']([^"']+)["']/i.exec(linkTag)?.[1] ?? "";
    const dims = sizes.match(/(\d+)x(\d+)/i);
    const area = dims ? parseInt(dims[1]) * parseInt(dims[2]) : 0;
    let resolved: string;
    try {
      resolved = new URL(href, baseUrl).toString();
    } catch {
      continue; // unresolvable href — ignore this candidate
    }
    // Strict `>` keeps the earliest candidate on ties, matching the stable
    // descending sort this replaces.
    if (!best || area > best.area) best = { href: resolved, area };
  }
  return best?.href;
}
/**
 * Filename tokens that mark an image as UI chrome (logos, icons, sprites,
 * tracking pixels, …) rather than page content. Each `-`/`_`/`.`-separated
 * token of the image's base filename is compared (case-insensitively)
 * against this set.
 */
const UI_IMAGE_KEYWORDS = new Set([
  "logo",
  "icon",
  "sprite",
  "favicon",
  "avatar",
  "banner",
  "header",
  "nav",
  "menu",
  "cart",
  "search",
  "tracking",
  "pixel",
  "bg",
  "background",
]);

/**
 * Heuristic: does `src` look like UI chrome rather than content?
 * True for SVGs and for filenames whose tokens hit `UI_IMAGE_KEYWORDS`.
 */
function isUiImage(src: string): boolean {
  // Vector images are treated as icons/logos outright.
  if (/\.svg(\?|$)/i.test(src)) return true;
  const pathOnly = src.split("?")[0];
  const filename = pathOnly.split("/").pop() ?? "";
  const stem = filename.replace(/\.[^.]+$/, ""); // strip extension
  for (const token of stem.toLowerCase().split(/[-_.]/)) {
    if (UI_IMAGE_KEYWORDS.has(token)) return true;
  }
  return false;
}
/**
 * Return the `src` of the first `<img>` that looks like content rather than UI
 * chrome. Skips SVGs, data URIs, and images whose filename tokens match common
 * icon/logo/nav patterns (logo, icon, sprite, etc. — see `isUiImage`).
 * Resolves relative and protocol-relative URLs against `baseUrl`.
 */
export function extractFirstContentImage(
  html: string,
  baseUrl: string,
): string | undefined {
  const tagRe = /<img[^>]+>/gi;
  for (let m = tagRe.exec(html); m !== null; m = tagRe.exec(html)) {
    const src = /\bsrc=["']([^"']+)["']/i.exec(m[0])?.[1];
    if (!src || src.startsWith("data:") || isUiImage(src)) continue;
    try {
      return new URL(src, baseUrl).toString();
    } catch {
      // Malformed URL — move on to the next image.
    }
  }
  return undefined;
}
/**
 * Extract `href` from the first `<link rel="…">` whose rel contains `relFragment`,
 * resolved to an absolute URL using `baseUrl`. Both attribute orderings are
 * handled. Returns undefined when no such link exists or when a matched href
 * cannot be resolved against `baseUrl`.
 */
export function extractLinkHref(
  html: string,
  relFragment: string,
  baseUrl: string,
): string | undefined {
  // Escape regex metacharacters so the fragment is matched literally.
  const frag = relFragment.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const orderings = [
    `<link[^>]+rel=["'][^"']*${frag}[^"']*["'][^>]+href=["']([^"']+)["']`,
    `<link[^>]+href=["']([^"']+)["'][^>]+rel=["'][^"']*${frag}[^"']*["']`,
  ];
  for (const source of orderings) {
    const hit = new RegExp(source, "i").exec(html);
    if (!hit) continue;
    try {
      return new URL(hit[1], baseUrl).toString();
    } catch {
      return undefined;
    }
  }
  return undefined;
}
/**
 * True for loopback and RFC-1918 private hosts.
 * NOTE: the WHATWG URL API reports IPv6 hostnames WITH brackets (`"[::1]"`),
 * so both the bare and bracketed spellings are checked — the previous
 * bare-only `"::1"` comparison never matched a hostname obtained from
 * `new URL(...)`, letting `http://[::1]/…` through. `0.0.0.0` (unspecified,
 * routes to local interfaces on most systems) is blocked as well.
 * DNS rebinding is still not mitigated here.
 */
function isPrivateHost(hostname: string): boolean {
  if (
    hostname === "localhost" ||
    hostname === "::1" ||
    hostname === "[::1]" ||
    hostname === "0.0.0.0"
  ) return true;
  // RFC-1918 ranges plus the 127/8 loopback block.
  return /^(127\.|10\.|172\.(1[6-9]|2\d|3[01])\.|192\.168\.)/.test(hostname);
}

/** Paths on our own host that selfProvider resolves straight from the DB. */
const SELF_PATH_RE = /^\/(dumps|users|playlists)\/[^/]+$/;

/**
 * Validate that `raw` is an http(s) URL we are willing to process.
 * Private/loopback hosts are rejected unless the path is a self-referential
 * gerbeur URL — those are resolved directly from the DB by selfProvider,
 * so no outbound HTTP request is ever made for them.
 */
export function isValidHttpUrl(raw: string): boolean {
  try {
    const u = new URL(raw);
    if (u.protocol !== "http:" && u.protocol !== "https:") return false;
    if (isPrivateHost(u.hostname) && !SELF_PATH_RE.test(u.pathname)) {
      return false;
    }
    return true;
  } catch {
    // Not parseable as a URL at all.
    return false;
  }
}
/**
 * Resolve `url` to rich-content metadata via the first matching provider.
 * Never rejects: on any provider failure (bad TLS cert, timeout, parse
 * error, …) a minimal "generic" stub is returned so callers always get
 * something displayable. Returns undefined only when `url` itself cannot
 * be parsed.
 */
export async function fetchRichContent(
  url: string,
): Promise<RichContent | undefined> {
  try {
    // genericProvider always matches, so find() cannot come up empty.
    const provider = providers.find((p) => p.matches(url))!;
    return await provider.fetch(url);
  } catch (err) {
    console.error(`[rich-content] Failed to fetch metadata for ${url}:`, err);
    // Fall back to a bare stub built from the hostname alone.
    let hostname: string;
    try {
      hostname = new URL(url).hostname;
    } catch {
      return undefined;
    }
    return {
      type: "generic",
      url,
      siteName: hostname.replace(/^www\./, ""),
    };
  }
}