From 6d9e27a5919e46f2d9dc9cd3cfd96cb263888f13 Mon Sep 17 00:00:00 2001
From: William Valentin <william.valentin.info@gmail.com>
Date: Fri, 6 Feb 2026 14:24:28 -0800
Subject: [PATCH] feat: enhance web-fetch with HTML-to-markdown extraction
 (Phase 6)

Add turndown + readability (on a linkedom-parsed DOM) for clean content extraction:
- HTML-to-markdown conversion with smart article extraction
- Format parameter (markdown/text/html)
- Response caching for repeated fetches
- 10 tests
---
 src/tools/builtin/web-fetch.test.ts | 178 +++++++++++++++++++++++-
 src/tools/builtin/web-fetch.ts      | 204 +++++++++++++++++++++++++++-
 2 files changed, 374 insertions(+), 8 deletions(-)
diff --git a/src/tools/builtin/web-fetch.test.ts b/src/tools/builtin/web-fetch.test.ts
index 9705e84..1b3dc80 100644
--- a/src/tools/builtin/web-fetch.test.ts
+++ b/src/tools/builtin/web-fetch.test.ts
@@ -1,34 +1,142 @@
-import { describe, it, expect, vi, beforeEach } from 'vitest';
-import { webFetchTool } from './web-fetch.js';
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+import { webFetchTool, _clearCache, _MAX_CONTENT_LENGTH } from './web-fetch.js';
 
+// ---------------------------------------------------------------------------
 // Mock global fetch
+// ---------------------------------------------------------------------------
+
 const mockFetch = vi.fn();
 vi.stubGlobal('fetch', mockFetch);
 
+// ---------------------------------------------------------------------------
+// HTML fixture used by extraction tests
+// ---------------------------------------------------------------------------
+
+const SAMPLE_HTML = `<!DOCTYPE html>
+<html><head><title>Test Page</title>
+<script>console.log('js')</script>
+<style>body { color: red }</style>
+</head><body>
+<nav>Menu items</nav>
+<article>
+<h1>Article Title</h1>
+<p>This is the main content of the article. It contains important information.</p>
+<p>Second paragraph with more details about the topic.</p>
+</article>
+<footer>Copyright 2026</footer>
+</body></html>`;
+
+// ---------------------------------------------------------------------------
+// Setup / teardown
+// ---------------------------------------------------------------------------
+
 beforeEach(() => {
   mockFetch.mockReset();
+  _clearCache();
 });
 
+afterEach(() => {
+  vi.useRealTimers();
+});
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
 describe('web.fetch', () => {
+  // ---- Metadata ----
+
   it('has correct metadata', () => {
     expect(webFetchTool.name).toBe('web.fetch');
     expect(webFetchTool.inputSchema.required).toContain('url');
+    // New format property is advertised in the schema
+    expect(webFetchTool.inputSchema.properties).toHaveProperty('format');
   });
 
-  it('fetches a URL and returns body text', async () => {
+  // ---- Default behaviour (markdown extraction) ----
+
+  it('fetches a URL and returns markdown by default', async () => {
     mockFetch.mockResolvedValue({
       ok: true,
       status: 200,
-      text: async () => '<html><body><h1>Hello</h1><p>World</p></body></html>',
+      text: async () => SAMPLE_HTML,
       headers: new Headers({ 'content-type': 'text/html' }),
     });
 
     const result = await webFetchTool.execute({ url: 'https://example.com' });
     expect(result.success).toBe(true);
     expect(result.output).toBeTruthy();
+    // Should contain markdown-ish content (heading or paragraph text)
+    expect(result.output).toContain('Article Title');
+    // Should NOT contain raw HTML tags
+    expect(result.output).not.toContain('<script>');
+    expect(result.output).not.toContain('<nav>');
     expect(mockFetch).toHaveBeenCalledWith('https://example.com', expect.any(Object));
   });
 
+  // ---- format=html ----
+
+  it('returns raw HTML when format is html', async () => {
+    mockFetch.mockResolvedValue({
+      ok: true,
+      status: 200,
+      text: async () => SAMPLE_HTML,
+      headers: new Headers({ 'content-type': 'text/html' }),
+    });
+
+    const result = await webFetchTool.execute({
+      url: 'https://example.com',
+      format: 'html',
+    });
+    expect(result.success).toBe(true);
+    // Raw HTML is returned unchanged
+    expect(result.output).toContain('<article>');
+    expect(result.output).toContain('<h1>Article Title</h1>');
+    expect(result.output).toContain('<script>');
+  });
+
+  // ---- format=text ----
+
+  it('returns extracted text when format is text', async () => {
+    mockFetch.mockResolvedValue({
+      ok: true,
+      status: 200,
+      text: async () => SAMPLE_HTML,
+      headers: new Headers({ 'content-type': 'text/html' }),
+    });
+
+    const result = await webFetchTool.execute({
+      url: 'https://example.com',
+      format: 'text',
+    });
+    expect(result.success).toBe(true);
+    expect(result.output).toContain('Article Title');
+    expect(result.output).toContain('main content');
+    // No HTML tags
+    expect(result.output).not.toContain('<');
+  });
+
+  // ---- JSON content-type ----
+
+  it('returns prettified JSON for application/json content', async () => {
+    const jsonPayload = '{"name":"Flynn","version":1}';
+    mockFetch.mockResolvedValue({
+      ok: true,
+      status: 200,
+      text: async () => jsonPayload,
+      headers: new Headers({ 'content-type': 'application/json' }),
+    });
+
+    const result = await webFetchTool.execute({ url: 'https://api.example.com/data' });
+    expect(result.success).toBe(true);
+    // Should be prettified (indented) JSON
+    const parsed = JSON.parse(result.output);
+    expect(parsed).toEqual({ name: 'Flynn', version: 1 });
+    expect(result.output).toContain('\n'); // multi-line
+  });
+
+  // ---- HTTP error ----
+
   it('returns error on HTTP failure', async () => {
     mockFetch.mockResolvedValue({
       ok: false,
@@ -42,6 +150,8 @@ describe('web.fetch', () => {
     expect(result.error).toContain('404');
   });
 
+  // ---- Network error ----
+
   it('returns error on network failure', async () => {
     mockFetch.mockRejectedValue(new Error('network error'));
 
@@ -49,4 +159,64 @@ describe('web.fetch', () => {
     expect(result.success).toBe(false);
     expect(result.error).toContain('network error');
   });
+
+  // ---- Caching ----
+
+  it('caches responses and reuses them on the second call', async () => {
+    mockFetch.mockResolvedValue({
+      ok: true,
+      status: 200,
+      text: async () => '<html><body><p>Cached content</p></body></html>',
+      headers: new Headers({ 'content-type': 'text/html' }),
+    });
+
+    const first = await webFetchTool.execute({ url: 'https://cached.example.com' });
+    const second = await webFetchTool.execute({ url: 'https://cached.example.com' });
+
+    expect(first.success).toBe(true);
+    expect(second.success).toBe(true);
+    expect(second.output).toBe(first.output);
+    // fetch should have been called exactly once — second call served from cache
+    expect(mockFetch).toHaveBeenCalledTimes(1);
+  });
+
+  it('expires cache entries after TTL', async () => {
+    vi.useFakeTimers();
+
+    mockFetch.mockResolvedValue({
+      ok: true,
+      status: 200,
+      text: async () => '<html><body><p>Stale content</p></body></html>',
+      headers: new Headers({ 'content-type': 'text/html' }),
+    });
+
+    // First call — populates cache
+    await webFetchTool.execute({ url: 'https://stale.example.com' });
+    expect(mockFetch).toHaveBeenCalledTimes(1);
+
+    // Advance time past the 5-minute TTL
+    vi.advanceTimersByTime(5 * 60 * 1_000 + 1);
+
+    // Second call — cache should be expired, triggers a new fetch
+    await webFetchTool.execute({ url: 'https://stale.example.com' });
+    expect(mockFetch).toHaveBeenCalledTimes(2);
+  });
+
+  // ---- Truncation ----
+
+  it('truncates content exceeding max length with a marker', async () => {
+    const longContent = 'A'.repeat(_MAX_CONTENT_LENGTH + 1_000);
+    mockFetch.mockResolvedValue({
+      ok: true,
+      status: 200,
+      text: async () => longContent,
+      headers: new Headers({ 'content-type': 'text/plain' }),
+    });
+
+    const result = await webFetchTool.execute({ url: 'https://big.example.com' });
+    expect(result.success).toBe(true);
+    expect(result.output).toContain('[content truncated]');
+    // Total length should be MAX + marker, not the original
+    expect(result.output.length).toBeLessThan(longContent.length);
+  });
 });
diff --git a/src/tools/builtin/web-fetch.ts b/src/tools/builtin/web-fetch.ts
index 9c9f04c..e4ab737 100644
--- a/src/tools/builtin/web-fetch.ts
+++ b/src/tools/builtin/web-fetch.ts
@@ -1,31 +1,206 @@
+import { parseHTML } from 'linkedom';
+import { Readability } from '@mozilla/readability';
+import TurndownService from 'turndown';
 import type { Tool, ToolResult } from '../types.js';
 
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+type OutputFormat = 'markdown' | 'text' | 'html';
+
 interface WebFetchArgs {
   url: string;
+  format?: OutputFormat;
   timeout?: number;
 }
 
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+
+/** Maximum characters returned in output before truncation. */
+const MAX_CONTENT_LENGTH = 50_000;
+
+/** Cache time-to-live in milliseconds (5 minutes). */
+const CACHE_TTL_MS = 5 * 60 * 1_000;
+
+// ---------------------------------------------------------------------------
+// Response cache (module-level, lazy expiry)
+// ---------------------------------------------------------------------------
+
+interface CacheEntry {
+  output: string;
+  timestamp: number;
+}
+
+const cache = new Map<string, CacheEntry>();
+
+/** Derive the cache lookup key: the output format and the URL joined by `::`. */
+function cacheKey(url: string, format: OutputFormat): string {
+  return format + '::' + url;
+}
+
+/** Sweep the cache, dropping every entry whose age has reached the TTL. */
+function evictExpired(): void {
+  const cutoff = Date.now() - CACHE_TTL_MS;
+  cache.forEach((entry, key) => {
+    if (entry.timestamp <= cutoff) {
+      cache.delete(key); // deleting the visited entry mid-iteration is safe for Map
+    }
+  });
+}
+
+// ---------------------------------------------------------------------------
+// Extraction helpers
+// ---------------------------------------------------------------------------
+
+const turndown = new TurndownService({
+  headingStyle: 'atx',
+  codeBlockStyle: 'fenced',
+});
+
+/**
+ * Render an HTML page as markdown.
+ * Readability gets first pass at isolating the article; if it yields no
+ * content, the `<body>` markup (or the raw input when there is no body) is used.
+ */
+function htmlToMarkdown(html: string): string {
+  const dom = parseHTML(html).document;
+
+  // Preferred path: the article Readability extracted
+  const extracted = new Readability(dom as unknown as Document).parse();
+  const articleHtml = extracted?.content;
+  if (articleHtml) {
+    return turndown.turndown(articleHtml);
+  }
+
+  // Fallback path: no article found, so convert everything inside <body>
+  const bodyEl = dom.querySelector('body');
+  const fallbackHtml = bodyEl ? bodyEl.innerHTML : html;
+  return turndown.turndown(fallbackHtml);
+}
+
+/**
+ * Reduce an HTML page to plain text.
+ * Prefers the `textContent` of the article Readability extracts (trimmed only).
+ */
+function htmlToText(html: string): string {
+  const dom = parseHTML(html).document;
+
+  const extracted = new Readability(dom as unknown as Document).parse();
+  const articleText = extracted?.textContent;
+
+  if (articleText) {
+    return articleText.trim();
+  }
+
+  // Fallback: <body> text (or regex tag-stripping without a body), whitespace-collapsed
+  const bodyEl = dom.querySelector('body');
+  const loose = bodyEl ? bodyEl.textContent ?? '' : html.replace(/<[^>]*>/g, ' ');
+  return loose.replace(/\s+/g, ' ').trim();
+}
+
+/**
+ * Truncate content to at most `MAX_CONTENT_LENGTH` UTF-16 code units, appending
+ * a marker when truncation occurs. Never cuts a surrogate pair in half.
+ */
+function truncate(content: string): string {
+  if (content.length <= MAX_CONTENT_LENGTH) return content;
+  const last = content.charCodeAt(MAX_CONTENT_LENGTH - 1);
+  const end = last >= 0xd800 && last <= 0xdbff ? MAX_CONTENT_LENGTH - 1 : MAX_CONTENT_LENGTH;
+  return content.slice(0, end) + '\n\n[content truncated]';
+}
+
+// ---------------------------------------------------------------------------
+// Content-type routing
+// ---------------------------------------------------------------------------
+
+/**
+ * Process the raw response body according to its content type and the
+ * requested output format. Content-type matching is case-insensitive.
+ */
+function processResponse(
+  body: string,
+  contentType: string,
+  format: OutputFormat,
+): string {
+  // Media types are case-insensitive (RFC 9110 §8.3.1) — normalise before matching
+  const mediaType = contentType.toLowerCase();
+  // JSON (incl. `+json` suffixed types) — prettify regardless of requested format
+  if (mediaType.includes('application/json') || mediaType.includes('+json')) {
+    try {
+      return JSON.stringify(JSON.parse(body), null, 2);
+    } catch {
+      return body; // malformed JSON: hand back the raw body untouched
+    }
+  }
+  // Plain text — return as-is
+  if (mediaType.includes('text/plain')) {
+    return body;
+  }
+
+  // HTML (or unknown) — apply extraction pipeline based on format
+  switch (format) {
+    case 'html':
+      return body;
+    case 'text':
+      return htmlToText(body);
+    case 'markdown':
+    default:
+      return htmlToMarkdown(body);
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Tool definition
+// ---------------------------------------------------------------------------
+
 export const webFetchTool: Tool = {
   name: 'web.fetch',
-  description: 'Fetch the content of a URL via HTTP GET. Returns the response body as text.',
+  description:
+    'Fetch a URL and extract its content as clean markdown, text, or raw HTML. ' +
+    'By default, converts web pages to readable markdown by stripping navigation, ' +
+    'ads, and scripts. Returns prettified JSON for API endpoints.',
   inputSchema: {
     type: 'object',
     properties: {
       url: { type: 'string', description: 'The URL to fetch' },
-      timeout: { type: 'number', description: 'Timeout in milliseconds (default 15000)' },
+      format: {
+        type: 'string',
+        enum: ['markdown', 'text', 'html'],
+        description:
+          'Output format: "markdown" (default) extracts readable content, ' +
+          '"text" returns plain text, "html" returns raw HTML',
+      },
+      timeout: {
+        type: 'number',
+        description: 'Timeout in milliseconds (default 15000)',
+      },
     },
     required: ['url'],
   },
+
   execute: async (rawArgs: unknown): Promise<ToolResult> => {
     const args = rawArgs as WebFetchArgs;
+    const format: OutputFormat = args.format ?? 'markdown';
     const timeout = args.timeout ?? 15_000;
 
+    // ----- Check cache (lazy eviction) ----- //
+    evictExpired();
+    const key = cacheKey(args.url, format);
+    const cached = cache.get(key);
+    if (cached) {
+      return { success: true, output: cached.output };
+    }
+
+    // ----- Fetch ----- //
     try {
       const response = await fetch(args.url, {
         signal: AbortSignal.timeout(timeout),
         headers: {
           'User-Agent': 'Flynn/0.1 (personal AI assistant)',
-          'Accept': 'text/html, application/json, text/plain, */*',
+          Accept: 'text/html, application/json, text/plain, */*',
         },
       });
 
@@ -38,7 +213,16 @@ export const webFetchTool: Tool = {
       }
 
       const body = await response.text();
-      return { success: true, output: body };
+      const contentType = response.headers.get('content-type') ?? '';
+
+      // ----- Process & truncate ----- //
+      const processed = processResponse(body, contentType, format);
+      const output = truncate(processed);
+
+      // ----- Store in cache ----- //
+      cache.set(key, { output, timestamp: Date.now() });
+
+      return { success: true, output };
     } catch (error) {
       return {
         success: false,
@@ -48,3 +232,15 @@ export const webFetchTool: Tool = {
     }
   },
 };
+
+// ---------------------------------------------------------------------------
+// Exported for testing — a cache-reset hook plus the TTL / max-length constants
+// ---------------------------------------------------------------------------
+
+/** @internal — test-only hook that empties the module-level response cache. */
+export function _clearCache(): void {
+  cache.clear();
+}
+
+/** @internal — exposed for tests only */
+export { CACHE_TTL_MS as _CACHE_TTL_MS, MAX_CONTENT_LENGTH as _MAX_CONTENT_LENGTH };