import { parseHTML } from 'linkedom';
import { Readability } from '@mozilla/readability';
import TurndownService from 'turndown';
import type { Tool, ToolResult } from '../types.js';

// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

type OutputFormat = 'markdown' | 'text' | 'html';

interface WebFetchArgs {
  url: string;
  format?: OutputFormat;
  timeout?: number;
}

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

/** Maximum characters returned in output before truncation. */
const MAX_CONTENT_LENGTH = 50_000;

/** Cache time-to-live in milliseconds (5 minutes). */
const CACHE_TTL_MS = 5 * 60 * 1_000;

/** Request timeout in milliseconds used when the caller supplies none. */
const DEFAULT_TIMEOUT_MS = 15_000;

/** Format used when the caller supplies none (or an unrecognised one). */
const DEFAULT_FORMAT: OutputFormat = 'markdown';

/** Suffix appended to output that was cut at `MAX_CONTENT_LENGTH`. */
const TRUNCATION_MARKER = '\n\n[content truncated]';

// ---------------------------------------------------------------------------
// Response cache (module-level, lazy expiry)
// ---------------------------------------------------------------------------

interface CacheEntry {
  output: string;
  timestamp: number;
}

const cache = new Map<string, CacheEntry>();

/** Build a deterministic cache key from the request parameters. */
function cacheKey(url: string, format: OutputFormat): string {
  return `${format}::${url}`;
}

/** Remove all expired entries from the cache. */
function evictExpired(): void {
  const now = Date.now();
  for (const [key, entry] of cache) {
    if (now - entry.timestamp >= CACHE_TTL_MS) {
      cache.delete(key);
    }
  }
}

// ---------------------------------------------------------------------------
// Extraction helpers
// ---------------------------------------------------------------------------

const turndown = new TurndownService({
  headingStyle: 'atx',
  codeBlockStyle: 'fenced',
});

/**
 * Parse `html` with linkedom and run Readability over the resulting document.
 *
 * Returns both the document and the Readability result so callers can fall
 * back to the document when no article was extracted.
 *
 * NOTE(review): Readability's `parse()` is documented as modifying the DOM it
 * is given, so the returned `document` is presumably the post-processing DOM —
 * confirm that the fallbacks below are meant to operate on it.
 */
function parseArticle(html: string) {
  const { document } = parseHTML(html);
  // linkedom's document type is not the lib.dom `Document` Readability expects.
  const article = new Readability(document as unknown as Document).parse();
  return { document, article };
}

/**
 * Convert raw HTML into clean markdown using Readability + Turndown.
 * Falls back to converting the full `<body>` if Readability cannot extract an
 * article (e.g. non-article pages).
 *
 * @param html - Raw HTML source.
 * @returns Markdown produced by Turndown.
 */
function htmlToMarkdown(html: string): string {
  const { document, article } = parseArticle(html);

  if (article?.content) {
    return turndown.turndown(article.content);
  }

  // Fallback: convert the whole body via Turndown (or the raw input when the
  // parsed document has no <body>).
  const body = document.querySelector('body');
  return turndown.turndown(body ? body.innerHTML : html);
}

/**
 * Extract readable plain text from HTML.
 * Uses Readability for content extraction; when that yields nothing, falls
 * back to the body's text content (or crude tag stripping) with whitespace
 * collapsed to single spaces.
 *
 * @param html - Raw HTML source.
 * @returns Trimmed plain text.
 */
function htmlToText(html: string): string {
  const { document, article } = parseArticle(html);

  if (article?.textContent) {
    return article.textContent.trim();
  }

  // Fallback: crude tag stripping
  const body = document.querySelector('body');
  const raw = body ? body.textContent ?? '' : html.replace(/<[^>]*>/g, ' ');
  return raw.replace(/\s+/g, ' ').trim();
}

/**
 * Truncate content to at most `MAX_CONTENT_LENGTH` characters, appending a
 * marker when truncation occurs.
 *
 * The cut backs off by one code unit when it would otherwise land between the
 * two halves of a surrogate pair, so the output never ends in a lone high
 * surrogate.
 */
function truncate(content: string): string {
  if (content.length <= MAX_CONTENT_LENGTH) {
    return content;
  }

  let end = MAX_CONTENT_LENGTH;
  const lastUnit = content.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return content.slice(0, end) + TRUNCATION_MARKER;
}

// ---------------------------------------------------------------------------
// Content-type routing
// ---------------------------------------------------------------------------

/**
 * Process the raw response body according to its content type and the
 * requested output format.
 *
 * @param body - Response body as text.
 * @param contentType - Value of the `content-type` header (may be empty).
 * @param format - Requested output format; only consulted for HTML/unknown
 *   content types.
 * @returns The processed (not yet truncated) output.
 */
function processResponse(
  body: string,
  contentType: string,
  format: OutputFormat,
): string {
  // Media types are case-insensitive, so compare in lower case.
  const normalized = contentType.toLowerCase();
  const mediaType = (normalized.split(';', 1)[0] ?? '').trim();

  // JSON responses — always prettify regardless of requested format.
  // Structured-syntax suffixes such as `application/ld+json` count as JSON.
  if (normalized.includes('application/json') || mediaType.endsWith('+json')) {
    try {
      return JSON.stringify(JSON.parse(body), null, 2);
    } catch {
      // If JSON parsing fails, return raw body
      return body;
    }
  }

  // Plain text — return as-is
  if (normalized.includes('text/plain')) {
    return body;
  }

  // HTML (or unknown) — apply extraction pipeline based on format
  switch (format) {
    case 'html':
      return body;
    case 'text':
      return htmlToText(body);
    case 'markdown':
    default:
      return htmlToMarkdown(body);
  }
}

// ---------------------------------------------------------------------------
// Argument handling
// ---------------------------------------------------------------------------

/** Narrow an arbitrary value to one of the supported output formats. */
function isOutputFormat(value: unknown): value is OutputFormat {
  return value === 'markdown' || value === 'text' || value === 'html';
}

/** Build a failed `ToolResult` carrying `error` and an empty output. */
function failure(error: string): ToolResult {
  return { success: false, output: '', error };
}

// ---------------------------------------------------------------------------
// Tool definition
// ---------------------------------------------------------------------------

export const webFetchTool: Tool = {
  name: 'web.fetch',
  description:
    'Fetch a URL and extract its content as clean markdown, text, or raw HTML. ' +
    'By default, converts web pages to readable markdown by stripping navigation, ' +
    'ads, and scripts. Returns prettified JSON for API endpoints.',
  inputSchema: {
    type: 'object',
    properties: {
      url: { type: 'string', description: 'The URL to fetch' },
      format: {
        type: 'string',
        enum: ['markdown', 'text', 'html'],
        description:
          'Output format: "markdown" (default) extracts readable content, ' +
          '"text" returns plain text, "html" returns raw HTML',
      },
      timeout: {
        type: 'number',
        description: 'Timeout in milliseconds (default 15000)',
      },
    },
    required: ['url'],
  },

  /**
   * Fetch `url`, convert the body according to `format`, truncate it and
   * cache the result for `CACHE_TTL_MS`.
   *
   * Never rejects for bad input or network/HTTP failures: those are reported
   * as `{ success: false, output: '', error }`. Only successful responses are
   * cached.
   *
   * @param rawArgs - Expected to match `WebFetchArgs`; validated at runtime.
   */
  execute: async (rawArgs: unknown): Promise<ToolResult> => {
    // ----- Validate arguments ----- //
    // `rawArgs` is untyped input, so check it before touching any property.
    if (typeof rawArgs !== 'object' || rawArgs === null) {
      return failure('Invalid arguments: expected an object with a "url" string');
    }
    const { url, format: rawFormat, timeout: rawTimeout } = rawArgs as Record<
      string,
      unknown
    >;

    if (typeof url !== 'string' || url.trim() === '') {
      return failure('Invalid arguments: "url" must be a non-empty string');
    }

    // Unrecognised formats are handled as markdown, matching the default
    // branch in processResponse; normalising here keeps cache keys canonical.
    const format: OutputFormat = isOutputFormat(rawFormat)
      ? rawFormat
      : DEFAULT_FORMAT;
    const timeout =
      typeof rawTimeout === 'number' ? rawTimeout : DEFAULT_TIMEOUT_MS;

    // ----- Check cache (lazy eviction) ----- //
    evictExpired();
    const key = cacheKey(url, format);
    const cached = cache.get(key);
    if (cached) {
      return { success: true, output: cached.output };
    }

    // ----- Fetch ----- //
    // NOTE(review): the URL is fetched exactly as given. If callers can be
    // untrusted, consider restricting schemes/hosts before this point.
    try {
      const response = await fetch(url, {
        // An out-of-range timeout makes AbortSignal.timeout throw; the
        // surrounding try/catch turns that into a failure result.
        signal: AbortSignal.timeout(timeout),
        headers: {
          'User-Agent': 'Flynn/0.1 (personal AI assistant)',
          Accept: 'text/html, application/json, text/plain, */*',
        },
      });

      if (!response.ok) {
        // Error bodies can be arbitrarily large; bound them like normal output.
        const errorBody = truncate(await response.text());
        return failure(`HTTP ${response.status}: ${errorBody}`);
      }

      const body = await response.text();
      const contentType = response.headers.get('content-type') ?? '';

      // ----- Process & truncate ----- //
      const processed = processResponse(body, contentType, format);
      const output = truncate(processed);

      // ----- Store in cache ----- //
      cache.set(key, { output, timestamp: Date.now() });

      return { success: true, output };
    } catch (error: unknown) {
      return failure(error instanceof Error ? error.message : String(error));
    }
  },
};

// ---------------------------------------------------------------------------
// Exported for testing — allows tests to clear the cache between runs
// ---------------------------------------------------------------------------

/** @internal — exposed for tests only */
export function _clearCache(): void {
  cache.clear();
}

/** @internal — exposed for tests only */
export { CACHE_TTL_MS as _CACHE_TTL_MS, MAX_CONTENT_LENGTH as _MAX_CONTENT_LENGTH };