feat: enhance web-fetch with HTML-to-markdown extraction (Phase 6)
Add turndown + readability for clean content extraction: - HTML-to-markdown conversion with smart article extraction - Format parameter (markdown/text/html) - Response caching for repeated fetches - 10 tests
This commit is contained in:
@@ -1,34 +1,142 @@
|
|||||||
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
|
||||||
import { webFetchTool } from './web-fetch.js';
|
import { webFetchTool, _clearCache, _MAX_CONTENT_LENGTH } from './web-fetch.js';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
// Mock global fetch
|
// Mock global fetch
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
const mockFetch = vi.fn();
|
const mockFetch = vi.fn();
|
||||||
vi.stubGlobal('fetch', mockFetch);
|
vi.stubGlobal('fetch', mockFetch);
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// HTML fixture used by extraction tests
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
const SAMPLE_HTML = `<!DOCTYPE html>
|
||||||
|
<html><head><title>Test Page</title>
|
||||||
|
<script>console.log('js')</script>
|
||||||
|
<style>body { color: red }</style>
|
||||||
|
</head><body>
|
||||||
|
<nav>Menu items</nav>
|
||||||
|
<article>
|
||||||
|
<h1>Article Title</h1>
|
||||||
|
<p>This is the main content of the article. It contains important information.</p>
|
||||||
|
<p>Second paragraph with more details about the topic.</p>
|
||||||
|
</article>
|
||||||
|
<footer>Copyright 2026</footer>
|
||||||
|
</body></html>`;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Setup / teardown
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
beforeEach(() => {
|
beforeEach(() => {
|
||||||
mockFetch.mockReset();
|
mockFetch.mockReset();
|
||||||
|
_clearCache();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
afterEach(() => {
|
||||||
|
vi.useRealTimers();
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Tests
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
describe('web.fetch', () => {
|
describe('web.fetch', () => {
|
||||||
|
// ---- Metadata ----
|
||||||
|
|
||||||
it('has correct metadata', () => {
|
it('has correct metadata', () => {
|
||||||
expect(webFetchTool.name).toBe('web.fetch');
|
expect(webFetchTool.name).toBe('web.fetch');
|
||||||
expect(webFetchTool.inputSchema.required).toContain('url');
|
expect(webFetchTool.inputSchema.required).toContain('url');
|
||||||
|
// New format property is advertised in the schema
|
||||||
|
expect(webFetchTool.inputSchema.properties).toHaveProperty('format');
|
||||||
});
|
});
|
||||||
|
|
||||||
it('fetches a URL and returns body text', async () => {
|
// ---- Default behaviour (markdown extraction) ----
|
||||||
|
|
||||||
|
it('fetches a URL and returns markdown by default', async () => {
|
||||||
mockFetch.mockResolvedValue({
|
mockFetch.mockResolvedValue({
|
||||||
ok: true,
|
ok: true,
|
||||||
status: 200,
|
status: 200,
|
||||||
text: async () => '<html><body><h1>Hello</h1><p>World</p></body></html>',
|
text: async () => SAMPLE_HTML,
|
||||||
headers: new Headers({ 'content-type': 'text/html' }),
|
headers: new Headers({ 'content-type': 'text/html' }),
|
||||||
});
|
});
|
||||||
|
|
||||||
const result = await webFetchTool.execute({ url: 'https://example.com' });
|
const result = await webFetchTool.execute({ url: 'https://example.com' });
|
||||||
expect(result.success).toBe(true);
|
expect(result.success).toBe(true);
|
||||||
expect(result.output).toBeTruthy();
|
expect(result.output).toBeTruthy();
|
||||||
|
// Should contain markdown-ish content (heading or paragraph text)
|
||||||
|
expect(result.output).toContain('Article Title');
|
||||||
|
// Should NOT contain raw HTML tags
|
||||||
|
expect(result.output).not.toContain('<script>');
|
||||||
|
expect(result.output).not.toContain('<nav>');
|
||||||
expect(mockFetch).toHaveBeenCalledWith('https://example.com', expect.any(Object));
|
expect(mockFetch).toHaveBeenCalledWith('https://example.com', expect.any(Object));
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ---- format=html ----
|
||||||
|
|
||||||
|
it('returns raw HTML when format is html', async () => {
|
||||||
|
mockFetch.mockResolvedValue({
|
||||||
|
ok: true,
|
||||||
|
status: 200,
|
||||||
|
text: async () => SAMPLE_HTML,
|
||||||
|
headers: new Headers({ 'content-type': 'text/html' }),
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await webFetchTool.execute({
|
||||||
|
url: 'https://example.com',
|
||||||
|
format: 'html',
|
||||||
|
});
|
||||||
|
expect(result.success).toBe(true);
|
||||||
|
// Raw HTML is returned unchanged
|
||||||
|
expect(result.output).toContain('<article>');
|
||||||
|
expect(result.output).toContain('<h1>Article Title</h1>');
|
||||||
|
expect(result.output).toContain('<script>');
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---- format=text ----
|
||||||
|
|
||||||
|
it('returns extracted text when format is text', async () => {
|
||||||
|
mockFetch.mockResolvedValue({
|
||||||
|
ok: true,
|
||||||
|
status: 200,
|
||||||
|
text: async () => SAMPLE_HTML,
|
||||||
|
headers: new Headers({ 'content-type': 'text/html' }),
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await webFetchTool.execute({
|
||||||
|
url: 'https://example.com',
|
||||||
|
format: 'text',
|
||||||
|
});
|
||||||
|
expect(result.success).toBe(true);
|
||||||
|
expect(result.output).toContain('Article Title');
|
||||||
|
expect(result.output).toContain('main content');
|
||||||
|
// No HTML tags
|
||||||
|
expect(result.output).not.toContain('<');
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---- JSON content-type ----
|
||||||
|
|
||||||
|
it('returns prettified JSON for application/json content', async () => {
|
||||||
|
const jsonPayload = '{"name":"Flynn","version":1}';
|
||||||
|
mockFetch.mockResolvedValue({
|
||||||
|
ok: true,
|
||||||
|
status: 200,
|
||||||
|
text: async () => jsonPayload,
|
||||||
|
headers: new Headers({ 'content-type': 'application/json' }),
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await webFetchTool.execute({ url: 'https://api.example.com/data' });
|
||||||
|
expect(result.success).toBe(true);
|
||||||
|
// Should be prettified (indented) JSON
|
||||||
|
const parsed = JSON.parse(result.output);
|
||||||
|
expect(parsed).toEqual({ name: 'Flynn', version: 1 });
|
||||||
|
expect(result.output).toContain('\n'); // multi-line
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---- HTTP error ----
|
||||||
|
|
||||||
it('returns error on HTTP failure', async () => {
|
it('returns error on HTTP failure', async () => {
|
||||||
mockFetch.mockResolvedValue({
|
mockFetch.mockResolvedValue({
|
||||||
ok: false,
|
ok: false,
|
||||||
@@ -42,6 +150,8 @@ describe('web.fetch', () => {
|
|||||||
expect(result.error).toContain('404');
|
expect(result.error).toContain('404');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ---- Network error ----
|
||||||
|
|
||||||
it('returns error on network failure', async () => {
|
it('returns error on network failure', async () => {
|
||||||
mockFetch.mockRejectedValue(new Error('network error'));
|
mockFetch.mockRejectedValue(new Error('network error'));
|
||||||
|
|
||||||
@@ -49,4 +159,64 @@ describe('web.fetch', () => {
|
|||||||
expect(result.success).toBe(false);
|
expect(result.success).toBe(false);
|
||||||
expect(result.error).toContain('network error');
|
expect(result.error).toContain('network error');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ---- Caching ----
|
||||||
|
|
||||||
|
it('caches responses and reuses them on the second call', async () => {
|
||||||
|
mockFetch.mockResolvedValue({
|
||||||
|
ok: true,
|
||||||
|
status: 200,
|
||||||
|
text: async () => '<html><body><p>Cached content</p></body></html>',
|
||||||
|
headers: new Headers({ 'content-type': 'text/html' }),
|
||||||
|
});
|
||||||
|
|
||||||
|
const first = await webFetchTool.execute({ url: 'https://cached.example.com' });
|
||||||
|
const second = await webFetchTool.execute({ url: 'https://cached.example.com' });
|
||||||
|
|
||||||
|
expect(first.success).toBe(true);
|
||||||
|
expect(second.success).toBe(true);
|
||||||
|
expect(second.output).toBe(first.output);
|
||||||
|
// fetch should have been called exactly once — second call served from cache
|
||||||
|
expect(mockFetch).toHaveBeenCalledTimes(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('expires cache entries after TTL', async () => {
|
||||||
|
vi.useFakeTimers();
|
||||||
|
|
||||||
|
mockFetch.mockResolvedValue({
|
||||||
|
ok: true,
|
||||||
|
status: 200,
|
||||||
|
text: async () => '<html><body><p>Stale content</p></body></html>',
|
||||||
|
headers: new Headers({ 'content-type': 'text/html' }),
|
||||||
|
});
|
||||||
|
|
||||||
|
// First call — populates cache
|
||||||
|
await webFetchTool.execute({ url: 'https://stale.example.com' });
|
||||||
|
expect(mockFetch).toHaveBeenCalledTimes(1);
|
||||||
|
|
||||||
|
// Advance time past the 5-minute TTL
|
||||||
|
vi.advanceTimersByTime(5 * 60 * 1_000 + 1);
|
||||||
|
|
||||||
|
// Second call — cache should be expired, triggers a new fetch
|
||||||
|
await webFetchTool.execute({ url: 'https://stale.example.com' });
|
||||||
|
expect(mockFetch).toHaveBeenCalledTimes(2);
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---- Truncation ----
|
||||||
|
|
||||||
|
it('truncates content exceeding max length with a marker', async () => {
|
||||||
|
const longContent = 'A'.repeat(_MAX_CONTENT_LENGTH + 1_000);
|
||||||
|
mockFetch.mockResolvedValue({
|
||||||
|
ok: true,
|
||||||
|
status: 200,
|
||||||
|
text: async () => longContent,
|
||||||
|
headers: new Headers({ 'content-type': 'text/plain' }),
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await webFetchTool.execute({ url: 'https://big.example.com' });
|
||||||
|
expect(result.success).toBe(true);
|
||||||
|
expect(result.output).toContain('[content truncated]');
|
||||||
|
// Total length should be MAX + marker, not the original
|
||||||
|
expect(result.output.length).toBeLessThan(longContent.length);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -1,31 +1,206 @@
|
|||||||
|
import { parseHTML } from 'linkedom';
|
||||||
|
import { Readability } from '@mozilla/readability';
|
||||||
|
import TurndownService from 'turndown';
|
||||||
import type { Tool, ToolResult } from '../types.js';
|
import type { Tool, ToolResult } from '../types.js';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Types
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
type OutputFormat = 'markdown' | 'text' | 'html';
|
||||||
|
|
||||||
interface WebFetchArgs {
|
interface WebFetchArgs {
|
||||||
url: string;
|
url: string;
|
||||||
|
format?: OutputFormat;
|
||||||
timeout?: number;
|
timeout?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Constants
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** Maximum characters returned in output before truncation. */
|
||||||
|
const MAX_CONTENT_LENGTH = 50_000;
|
||||||
|
|
||||||
|
/** Cache time-to-live in milliseconds (5 minutes). */
|
||||||
|
const CACHE_TTL_MS = 5 * 60 * 1_000;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Response cache (module-level, lazy expiry)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
interface CacheEntry {
|
||||||
|
output: string;
|
||||||
|
timestamp: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
const cache = new Map<string, CacheEntry>();
|
||||||
|
|
||||||
|
/** Build a deterministic cache key from the request parameters. */
|
||||||
|
function cacheKey(url: string, format: OutputFormat): string {
|
||||||
|
return `${format}::${url}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Remove all expired entries from the cache. */
|
||||||
|
function evictExpired(): void {
|
||||||
|
const now = Date.now();
|
||||||
|
for (const [key, entry] of cache) {
|
||||||
|
if (now - entry.timestamp >= CACHE_TTL_MS) {
|
||||||
|
cache.delete(key);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Extraction helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
const turndown = new TurndownService({
|
||||||
|
headingStyle: 'atx',
|
||||||
|
codeBlockStyle: 'fenced',
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert raw HTML into clean markdown using Readability + Turndown.
|
||||||
|
* Falls back to converting the full `<body>` if Readability cannot extract an
|
||||||
|
* article (e.g. non-article pages).
|
||||||
|
*/
|
||||||
|
function htmlToMarkdown(html: string): string {
|
||||||
|
const { document } = parseHTML(html);
|
||||||
|
|
||||||
|
// Attempt Readability extraction
|
||||||
|
const reader = new Readability(document as unknown as Document);
|
||||||
|
const article = reader.parse();
|
||||||
|
|
||||||
|
if (article?.content) {
|
||||||
|
return turndown.turndown(article.content);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: convert the whole body via Turndown
|
||||||
|
const body = document.querySelector('body');
|
||||||
|
return turndown.turndown(body ? body.innerHTML : html);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract readable plain text from HTML.
|
||||||
|
* Uses Readability for content extraction, then strips remaining tags.
|
||||||
|
*/
|
||||||
|
function htmlToText(html: string): string {
|
||||||
|
const { document } = parseHTML(html);
|
||||||
|
|
||||||
|
const reader = new Readability(document as unknown as Document);
|
||||||
|
const article = reader.parse();
|
||||||
|
|
||||||
|
if (article?.textContent) {
|
||||||
|
return article.textContent.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: crude tag stripping
|
||||||
|
const body = document.querySelector('body');
|
||||||
|
const raw = body ? body.textContent ?? '' : html.replace(/<[^>]*>/g, ' ');
|
||||||
|
return raw.replace(/\s+/g, ' ').trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Truncate content to `MAX_CONTENT_LENGTH` characters, appending a marker
|
||||||
|
* when truncation occurs.
|
||||||
|
*/
|
||||||
|
function truncate(content: string): string {
|
||||||
|
if (content.length <= MAX_CONTENT_LENGTH) {
|
||||||
|
return content;
|
||||||
|
}
|
||||||
|
return content.slice(0, MAX_CONTENT_LENGTH) + '\n\n[content truncated]';
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Content-type routing
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Process the raw response body according to its content type and the
|
||||||
|
* requested output format.
|
||||||
|
*/
|
||||||
|
function processResponse(
|
||||||
|
body: string,
|
||||||
|
contentType: string,
|
||||||
|
format: OutputFormat,
|
||||||
|
): string {
|
||||||
|
// JSON responses — always prettify regardless of requested format
|
||||||
|
if (contentType.includes('application/json')) {
|
||||||
|
try {
|
||||||
|
return JSON.stringify(JSON.parse(body), null, 2);
|
||||||
|
} catch {
|
||||||
|
// If JSON parsing fails, return raw body
|
||||||
|
return body;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Plain text — return as-is
|
||||||
|
if (contentType.includes('text/plain')) {
|
||||||
|
return body;
|
||||||
|
}
|
||||||
|
|
||||||
|
// HTML (or unknown) — apply extraction pipeline based on format
|
||||||
|
switch (format) {
|
||||||
|
case 'html':
|
||||||
|
return body;
|
||||||
|
case 'text':
|
||||||
|
return htmlToText(body);
|
||||||
|
case 'markdown':
|
||||||
|
default:
|
||||||
|
return htmlToMarkdown(body);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Tool definition
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
export const webFetchTool: Tool = {
|
export const webFetchTool: Tool = {
|
||||||
name: 'web.fetch',
|
name: 'web.fetch',
|
||||||
description: 'Fetch the content of a URL via HTTP GET. Returns the response body as text.',
|
description:
|
||||||
|
'Fetch a URL and extract its content as clean markdown, text, or raw HTML. ' +
|
||||||
|
'By default, converts web pages to readable markdown by stripping navigation, ' +
|
||||||
|
'ads, and scripts. Returns prettified JSON for API endpoints.',
|
||||||
inputSchema: {
|
inputSchema: {
|
||||||
type: 'object',
|
type: 'object',
|
||||||
properties: {
|
properties: {
|
||||||
url: { type: 'string', description: 'The URL to fetch' },
|
url: { type: 'string', description: 'The URL to fetch' },
|
||||||
timeout: { type: 'number', description: 'Timeout in milliseconds (default 15000)' },
|
format: {
|
||||||
|
type: 'string',
|
||||||
|
enum: ['markdown', 'text', 'html'],
|
||||||
|
description:
|
||||||
|
'Output format: "markdown" (default) extracts readable content, ' +
|
||||||
|
'"text" returns plain text, "html" returns raw HTML',
|
||||||
|
},
|
||||||
|
timeout: {
|
||||||
|
type: 'number',
|
||||||
|
description: 'Timeout in milliseconds (default 15000)',
|
||||||
|
},
|
||||||
},
|
},
|
||||||
required: ['url'],
|
required: ['url'],
|
||||||
},
|
},
|
||||||
|
|
||||||
execute: async (rawArgs: unknown): Promise<ToolResult> => {
|
execute: async (rawArgs: unknown): Promise<ToolResult> => {
|
||||||
const args = rawArgs as WebFetchArgs;
|
const args = rawArgs as WebFetchArgs;
|
||||||
|
const format: OutputFormat = args.format ?? 'markdown';
|
||||||
const timeout = args.timeout ?? 15_000;
|
const timeout = args.timeout ?? 15_000;
|
||||||
|
|
||||||
|
// ----- Check cache (lazy eviction) ----- //
|
||||||
|
evictExpired();
|
||||||
|
const key = cacheKey(args.url, format);
|
||||||
|
const cached = cache.get(key);
|
||||||
|
if (cached) {
|
||||||
|
return { success: true, output: cached.output };
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----- Fetch ----- //
|
||||||
try {
|
try {
|
||||||
const response = await fetch(args.url, {
|
const response = await fetch(args.url, {
|
||||||
signal: AbortSignal.timeout(timeout),
|
signal: AbortSignal.timeout(timeout),
|
||||||
headers: {
|
headers: {
|
||||||
'User-Agent': 'Flynn/0.1 (personal AI assistant)',
|
'User-Agent': 'Flynn/0.1 (personal AI assistant)',
|
||||||
'Accept': 'text/html, application/json, text/plain, */*',
|
Accept: 'text/html, application/json, text/plain, */*',
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -38,7 +213,16 @@ export const webFetchTool: Tool = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const body = await response.text();
|
const body = await response.text();
|
||||||
return { success: true, output: body };
|
const contentType = response.headers.get('content-type') ?? '';
|
||||||
|
|
||||||
|
// ----- Process & truncate ----- //
|
||||||
|
const processed = processResponse(body, contentType, format);
|
||||||
|
const output = truncate(processed);
|
||||||
|
|
||||||
|
// ----- Store in cache ----- //
|
||||||
|
cache.set(key, { output, timestamp: Date.now() });
|
||||||
|
|
||||||
|
return { success: true, output };
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
return {
|
return {
|
||||||
success: false,
|
success: false,
|
||||||
@@ -48,3 +232,15 @@ export const webFetchTool: Tool = {
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Exported for testing — allows tests to clear the cache between runs
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** @internal — exposed for tests only */
|
||||||
|
export function _clearCache(): void {
|
||||||
|
cache.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @internal — exposed for tests only */
|
||||||
|
export { CACHE_TTL_MS as _CACHE_TTL_MS, MAX_CONTENT_LENGTH as _MAX_CONTENT_LENGTH };
|
||||||
|
|||||||
Reference in New Issue
Block a user