feat: enhance web-fetch with HTML-to-markdown extraction (Phase 6)
Add turndown + readability for clean content extraction:
- HTML-to-markdown conversion with smart article extraction
- Format parameter (markdown/text/html)
- Response caching for repeated fetches
- 10 tests
This commit is contained in:
@@ -1,31 +1,206 @@
|
||||
import { parseHTML } from 'linkedom';
|
||||
import { Readability } from '@mozilla/readability';
|
||||
import TurndownService from 'turndown';
|
||||
import type { Tool, ToolResult } from '../types.js';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Output representations that can be requested for a fetched page. */
type OutputFormat = 'markdown' | 'text' | 'html';
|
||||
|
||||
/** Arguments accepted by the web-fetch tool's `execute` handler. */
interface WebFetchArgs {
  /** Address of the resource to fetch. */
  url: string;
  /** Requested output representation; `execute` uses `'markdown'` when omitted. */
  format?: OutputFormat;
  /** Request timeout in milliseconds; `execute` uses 15 000 when omitted. */
  timeout?: number;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Constants
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Maximum characters returned in output before truncation.
 * Longer content is cut to this length and a truncation marker is appended.
 */
const MAX_CONTENT_LENGTH = 50_000;
|
||||
|
||||
/**
 * Cache time-to-live in milliseconds (5 minutes).
 * Entries whose age reaches this value are dropped by `evictExpired`.
 */
const CACHE_TTL_MS = 5 * 60 * 1_000;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Response cache (module-level, lazy expiry)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** One processed response held in the module-level cache. */
interface CacheEntry {
  /** Processed (and possibly truncated) output that is returned on a cache hit. */
  output: string;
  /** `Date.now()` value recorded when the entry was stored; used for TTL expiry. */
  timestamp: number;
}
|
||||
|
||||
/** Processed responses keyed by the string built in `cacheKey`; expired entries are removed by `evictExpired`. */
const cache = new Map<string, CacheEntry>();
|
||||
|
||||
/**
 * Build a deterministic cache key from the request parameters.
 *
 * The format is part of the key because the same URL produces different
 * output for each requested format.
 *
 * @param url - URL exactly as supplied by the caller (not normalised).
 * @param format - Requested output format.
 * @returns A key of the form `<format>::<url>`.
 */
function cacheKey(url: string, format: OutputFormat): string {
  return `${format}::${url}`;
}
|
||||
|
||||
/**
 * Remove all expired entries from the cache.
 *
 * An entry is expired once its age (current time minus its stored
 * `timestamp`) is at least `CACHE_TTL_MS`. The current time is sampled once
 * so every entry is judged against the same instant.
 */
function evictExpired(): void {
  const now = Date.now();

  for (const [key, entry] of cache) {
    const age = now - entry.timestamp;
    if (age >= CACHE_TTL_MS) {
      cache.delete(key);
    }
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Extraction helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Shared Turndown converter used for all HTML-to-markdown conversions in this
 * module. Configured for ATX headings and fenced code blocks.
 */
const turndown = new TurndownService({
  headingStyle: 'atx',
  codeBlockStyle: 'fenced',
});
|
||||
|
||||
/**
 * Convert raw HTML into clean markdown using Readability + Turndown.
 *
 * Falls back to converting the full `<body>` (or the raw input when no
 * `<body>` element is found) if Readability cannot extract an article, e.g.
 * on non-article pages. An exception thrown by Readability is treated the
 * same as "no article", so the fallback still runs instead of failing the
 * whole fetch.
 *
 * @param html - Raw HTML source.
 * @returns Markdown rendering of the extracted article, or of the fallback content.
 */
function htmlToMarkdown(html: string): string {
  const { document } = parseHTML(html);

  // Attempt Readability extraction. The cast bridges linkedom's document type
  // to the DOM `Document` type that Readability's typings expect.
  let articleHtml: string | null | undefined;
  try {
    articleHtml = new Readability(document as unknown as Document).parse()?.content;
  } catch {
    // Readability rejects documents it cannot handle (for example one without
    // a root element); fall through to the whole-body conversion below.
    articleHtml = undefined;
  }

  if (articleHtml) {
    return turndown.turndown(articleHtml);
  }

  // Fallback: convert the whole body via Turndown.
  const body = document.querySelector('body');
  return turndown.turndown(body ? body.innerHTML : html);
}
|
||||
|
||||
/**
 * Extract readable plain text from HTML.
 *
 * Uses Readability for content extraction and returns the article's trimmed
 * text. When Readability yields no text (no article, whitespace-only text, or
 * an exception), falls back to the `<body>` text content, or to crude tag
 * stripping of the raw input when no `<body>` element is found, with
 * whitespace runs collapsed to single spaces.
 *
 * @param html - Raw HTML source.
 * @returns Plain-text rendering of the page content.
 */
function htmlToText(html: string): string {
  const { document } = parseHTML(html);

  // The cast bridges linkedom's document type to the DOM `Document` type that
  // Readability's typings expect.
  let extracted: string | null | undefined;
  try {
    extracted = new Readability(document as unknown as Document).parse()?.textContent;
  } catch {
    // Readability rejects documents it cannot handle (for example one without
    // a root element); use the fallback below instead of failing.
    extracted = undefined;
  }

  // Trim before testing so whitespace-only article text does not short-circuit
  // to an empty result while the fallback could still produce content.
  const articleText = extracted?.trim();
  if (articleText) {
    return articleText;
  }

  // Fallback: body text, or crude tag stripping when there is no <body>.
  const body = document.querySelector('body');
  const raw = body ? (body.textContent ?? '') : html.replace(/<[^>]*>/g, ' ');
  return raw.replace(/\s+/g, ' ').trim();
}
|
||||
|
||||
/**
 * Truncate content to at most `maxLength` UTF-16 code units, appending a
 * marker when truncation occurs.
 *
 * The cut never lands between the two halves of a surrogate pair: if the last
 * retained code unit would be a lone high surrogate it is dropped too, so the
 * result never ends in a broken character.
 *
 * @param content - Text to limit.
 * @param maxLength - Maximum number of code units to keep; defaults to
 *   `MAX_CONTENT_LENGTH`. Negative values are treated as 0.
 * @returns `content` unchanged when it fits, otherwise the shortened text
 *   followed by `\n\n[content truncated]`.
 */
function truncate(content: string, maxLength: number = MAX_CONTENT_LENGTH): string {
  if (content.length <= maxLength) {
    return content;
  }

  let end = Math.max(0, Math.trunc(maxLength));

  // 0xD800–0xDBFF are high (leading) surrogates; keeping one without its
  // trailing partner would emit an invalid character.
  const lastUnit = content.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }

  return content.slice(0, end) + '\n\n[content truncated]';
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Content-type routing
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Process the raw response body according to its content type and the
 * requested output format.
 *
 * Content-type matching is case-insensitive and recognises `+json` media
 * types (e.g. `application/ld+json`) as JSON.
 *
 * @param body - Response body as text.
 * @param contentType - Value of the `content-type` header ('' when absent).
 * @param format - Output format requested by the caller; only affects
 *   responses that are neither JSON nor plain text.
 * @returns Pretty-printed JSON, the untouched plain-text body, or the result
 *   of the HTML pipeline for the requested format.
 */
function processResponse(
  body: string,
  contentType: string,
  format: OutputFormat,
): string {
  const normalizedType = contentType.toLowerCase();

  // JSON responses — always prettify regardless of requested format.
  if (normalizedType.includes('application/json') || normalizedType.includes('+json')) {
    try {
      return JSON.stringify(JSON.parse(body), null, 2);
    } catch {
      // The body is not valid JSON despite the header; return it unchanged.
      return body;
    }
  }

  // Plain text — return as-is.
  if (normalizedType.includes('text/plain')) {
    return body;
  }

  // HTML (or unknown) — apply extraction pipeline based on format.
  switch (format) {
    case 'html':
      return body;
    case 'text':
      return htmlToText(body);
    case 'markdown':
    default:
      return htmlToMarkdown(body);
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tool definition
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export const webFetchTool: Tool = {
|
||||
name: 'web.fetch',
|
||||
description: 'Fetch the content of a URL via HTTP GET. Returns the response body as text.',
|
||||
description:
|
||||
'Fetch a URL and extract its content as clean markdown, text, or raw HTML. ' +
|
||||
'By default, converts web pages to readable markdown by stripping navigation, ' +
|
||||
'ads, and scripts. Returns prettified JSON for API endpoints.',
|
||||
inputSchema: {
|
||||
type: 'object',
|
||||
properties: {
|
||||
url: { type: 'string', description: 'The URL to fetch' },
|
||||
timeout: { type: 'number', description: 'Timeout in milliseconds (default 15000)' },
|
||||
format: {
|
||||
type: 'string',
|
||||
enum: ['markdown', 'text', 'html'],
|
||||
description:
|
||||
'Output format: "markdown" (default) extracts readable content, ' +
|
||||
'"text" returns plain text, "html" returns raw HTML',
|
||||
},
|
||||
timeout: {
|
||||
type: 'number',
|
||||
description: 'Timeout in milliseconds (default 15000)',
|
||||
},
|
||||
},
|
||||
required: ['url'],
|
||||
},
|
||||
|
||||
execute: async (rawArgs: unknown): Promise<ToolResult> => {
|
||||
const args = rawArgs as WebFetchArgs;
|
||||
const format: OutputFormat = args.format ?? 'markdown';
|
||||
const timeout = args.timeout ?? 15_000;
|
||||
|
||||
// ----- Check cache (lazy eviction) ----- //
|
||||
evictExpired();
|
||||
const key = cacheKey(args.url, format);
|
||||
const cached = cache.get(key);
|
||||
if (cached) {
|
||||
return { success: true, output: cached.output };
|
||||
}
|
||||
|
||||
// ----- Fetch ----- //
|
||||
try {
|
||||
const response = await fetch(args.url, {
|
||||
signal: AbortSignal.timeout(timeout),
|
||||
headers: {
|
||||
'User-Agent': 'Flynn/0.1 (personal AI assistant)',
|
||||
'Accept': 'text/html, application/json, text/plain, */*',
|
||||
Accept: 'text/html, application/json, text/plain, */*',
|
||||
},
|
||||
});
|
||||
|
||||
@@ -38,7 +213,16 @@ export const webFetchTool: Tool = {
|
||||
}
|
||||
|
||||
const body = await response.text();
|
||||
return { success: true, output: body };
|
||||
const contentType = response.headers.get('content-type') ?? '';
|
||||
|
||||
// ----- Process & truncate ----- //
|
||||
const processed = processResponse(body, contentType, format);
|
||||
const output = truncate(processed);
|
||||
|
||||
// ----- Store in cache ----- //
|
||||
cache.set(key, { output, timestamp: Date.now() });
|
||||
|
||||
return { success: true, output };
|
||||
} catch (error) {
|
||||
return {
|
||||
success: false,
|
||||
@@ -48,3 +232,15 @@ export const webFetchTool: Tool = {
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Exported for testing — allows tests to clear the cache between runs
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Remove every entry from the response cache.
 *
 * @internal — exposed for tests only, so they can reset state between runs.
 */
export function _clearCache(): void {
  cache.clear();
}
|
||||
|
||||
/** @internal — exposed for tests only */
|
||||
export { CACHE_TTL_MS as _CACHE_TTL_MS, MAX_CONTENT_LENGTH as _MAX_CONTENT_LENGTH };
|
||||
|
||||
Reference in New Issue
Block a user