// flynn/src/tools/builtin/web-fetch.ts

import { parseHTML } from 'linkedom';
import { Readability } from '@mozilla/readability';
import TurndownService from 'turndown';
import type { Tool, ToolResult } from '../types.js';

// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

/** Output formats a caller may request: `markdown`, `text`, or `html`. */
type OutputFormat = 'markdown' | 'text' | 'html';

/** Arguments accepted by the `web.fetch` tool's `execute` function. */
interface WebFetchArgs {
  /** The URL to fetch. */
  url: string;
  /** Requested output format; `execute` uses `'markdown'` when omitted. */
  format?: OutputFormat;
  /** Request timeout in milliseconds; `execute` uses 15000 when omitted. */
  timeout?: number;
}

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

/**
 * Maximum number of characters (as counted by `String.prototype.length`)
 * returned in output before truncation.
 */
const MAX_CONTENT_LENGTH = 50_000;

/**
 * Cache time-to-live in milliseconds (5 minutes). An entry whose age has
 * reached this value is removed by `evictExpired`.
 */
const CACHE_TTL_MS = 5 * 60 * 1_000;

// ---------------------------------------------------------------------------
// Response cache (module-level, lazy expiry)
// ---------------------------------------------------------------------------

/** A processed response held in the module-level cache. */
interface CacheEntry {
  /** The output string that was returned to the caller when it was stored. */
  output: string;
  /** `Date.now()` value recorded when the entry was stored. */
  timestamp: number;
}

/** Cached outputs, keyed by the string produced by `cacheKey`. */
const cache = new Map<string, CacheEntry>();

/**
 * Derive the cache lookup key for a request.
 *
 * The key is the output format, a `::` separator, and the URL, so the same
 * URL requested in different formats is cached separately.
 */
function cacheKey(url: string, format: OutputFormat): string {
  const separator = '::';
  return format + separator + url;
}

/**
 * Drop every cache entry whose age has reached `CACHE_TTL_MS`.
 *
 * An entry is expired when its timestamp is at or before
 * `Date.now() - CACHE_TTL_MS`.
 */
function evictExpired(): void {
  const cutoff = Date.now() - CACHE_TTL_MS;
  cache.forEach((entry, key) => {
    if (entry.timestamp <= cutoff) {
      cache.delete(key);
    }
  });
}

// ---------------------------------------------------------------------------
// Extraction helpers
// ---------------------------------------------------------------------------

/**
 * Shared Turndown converter used by `htmlToMarkdown`, configured with the
 * `atx` heading style and `fenced` code blocks.
 */
const turndown = new TurndownService({
  headingStyle: 'atx',
  codeBlockStyle: 'fenced',
});

/**
 * Turn an HTML string into markdown.
 *
 * The HTML is parsed with linkedom and handed to Readability. When
 * Readability yields non-empty article content, that content is converted
 * with Turndown. Otherwise the `<body>` element's inner HTML is converted
 * instead, or the raw input when the parsed document has no `<body>`.
 *
 * NOTE(review): Readability's documentation says `parse()` modifies the DOM
 * it is given, so the fallback presumably sees the body as Readability left
 * it — confirm this is intended.
 *
 * @param html - Raw HTML source.
 * @returns Markdown produced by Turndown.
 */
function htmlToMarkdown(html: string): string {
  const dom = parseHTML(html).document;

  // First choice: Readability's extracted article HTML.
  const extracted = new Readability(dom as unknown as Document).parse();
  const articleHtml = extracted?.content;
  if (articleHtml) {
    return turndown.turndown(articleHtml);
  }

  // Second choice: everything inside <body>, or the raw input as a last resort.
  const bodyElement = dom.querySelector('body');
  const fallbackHtml = bodyElement ? bodyElement.innerHTML : html;
  return turndown.turndown(fallbackHtml);
}

/**
 * Turn an HTML string into plain text.
 *
 * When Readability yields non-empty article text, it is returned trimmed.
 * Otherwise the text is taken from the `<body>` element's `textContent`, or
 * from the raw input with tags replaced by spaces when there is no `<body>`.
 * The fallback text has its whitespace runs collapsed to single spaces.
 *
 * @param html - Raw HTML source.
 * @returns The extracted text with surrounding whitespace removed.
 */
function htmlToText(html: string): string {
  const dom = parseHTML(html).document;
  const extracted = new Readability(dom as unknown as Document).parse();

  const articleText = extracted?.textContent;
  if (articleText) {
    return articleText.trim();
  }

  // Fallback: take the body's text, or crudely strip tags from the raw input.
  const bodyElement = dom.querySelector('body');
  let rawText: string;
  if (bodyElement) {
    rawText = bodyElement.textContent ?? '';
  } else {
    rawText = html.replace(/<[^>]*>/g, ' ');
  }

  return rawText.replace(/\s+/g, ' ').trim();
}

/**
 * Truncate content to at most `maxLength` characters (UTF-16 code units),
 * appending a marker when truncation occurs.
 *
 * The cut never lands between the two halves of a surrogate pair: if the
 * last kept code unit would be a high surrogate, it is dropped too, so the
 * result never ends in a lone surrogate.
 *
 * @param content - The text to bound.
 * @param maxLength - Maximum number of code units to keep; defaults to
 *   `MAX_CONTENT_LENGTH`. Negative values are treated as 0.
 * @returns `content` unchanged when it fits, otherwise the kept prefix
 *   followed by `"\n\n[content truncated]"`.
 */
function truncate(
  content: string,
  maxLength: number = MAX_CONTENT_LENGTH,
): string {
  if (content.length <= maxLength) {
    return content;
  }

  let end = Math.max(0, Math.floor(maxLength));

  // A high surrogate (U+D800–U+DBFF) at the cut point has lost its partner.
  if (end > 0) {
    const lastUnit = content.charCodeAt(end - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      end -= 1;
    }
  }

  return content.slice(0, end) + '\n\n[content truncated]';
}

// ---------------------------------------------------------------------------
// Content-type routing
// ---------------------------------------------------------------------------

/**
 * Process the raw response body according to its content type and the
 * requested output format.
 *
 * Routing, in order:
 *  1. JSON (`application/json` or any `+json` structured-syntax type) is
 *     pretty-printed regardless of `format`; if it fails to parse, the raw
 *     body is returned.
 *  2. `text/plain` is returned unchanged.
 *  3. Everything else is treated as HTML and handled per `format`.
 *
 * Content-type matching is case-insensitive.
 *
 * @param body - Response body as text.
 * @param contentType - Value of the `Content-Type` header (may be empty).
 * @param format - Output format requested by the caller.
 * @returns The processed content (not yet truncated).
 */
function processResponse(
  body: string,
  contentType: string,
  format: OutputFormat,
): string {
  // Media types are case-insensitive, so normalize once before matching.
  const normalizedType = contentType.toLowerCase();

  // JSON responses — always prettify regardless of requested format.
  // The regex matches a "+json" suffix at the end of the media type
  // (e.g. application/ld+json, application/problem+json; charset=utf-8).
  const isJson =
    normalizedType.includes('application/json') ||
    /\+json\s*(?:;|$)/.test(normalizedType);
  if (isJson) {
    try {
      return JSON.stringify(JSON.parse(body), null, 2);
    } catch {
      // Mislabelled or malformed JSON: hand back the raw body.
      return body;
    }
  }

  // Plain text — return as-is
  if (normalizedType.includes('text/plain')) {
    return body;
  }

  // HTML (or unknown) — apply extraction pipeline based on format
  switch (format) {
    case 'html':
      return body;
    case 'text':
      return htmlToText(body);
    case 'markdown':
    default:
      return htmlToMarkdown(body);
  }
}

// ---------------------------------------------------------------------------
// Tool definition
// ---------------------------------------------------------------------------

/**
 * The `web.fetch` tool: fetches a URL and returns its content as markdown,
 * plain text, raw HTML, or prettified JSON.
 *
 * `execute` never rejects for bad input or network failures; every failure is
 * reported as `{ success: false, output: '', error }`. Successful outputs are
 * cached per (format, url) and served from the cache until they expire.
 */
export const webFetchTool: Tool = {
  name: 'web.fetch',
  description:
    'Fetch a URL and extract its content as clean markdown, text, or raw HTML. ' +
    'By default, converts web pages to readable markdown by stripping navigation, ' +
    'ads, and scripts. Returns prettified JSON for API endpoints.',
  inputSchema: {
    type: 'object',
    properties: {
      url: { type: 'string', description: 'The URL to fetch' },
      format: {
        type: 'string',
        enum: ['markdown', 'text', 'html'],
        description:
          'Output format: "markdown" (default) extracts readable content, ' +
          '"text" returns plain text, "html" returns raw HTML',
      },
      timeout: {
        type: 'number',
        description: 'Timeout in milliseconds (default 15000)',
      },
    },
    required: ['url'],
  },

  execute: async (rawArgs: unknown): Promise<ToolResult> => {
    // ----- Validate arguments ----- //
    // Arguments arrive untyped at runtime. Without this guard a null or
    // undefined payload would throw before the try block and reject the
    // promise instead of producing a ToolResult.
    if (typeof rawArgs !== 'object' || rawArgs === null) {
      return {
        success: false,
        output: '',
        error: 'Invalid arguments: expected an object with a "url" string',
      };
    }
    const args = rawArgs as Partial<WebFetchArgs>;
    if (typeof args.url !== 'string' || args.url.trim() === '') {
      return {
        success: false,
        output: '',
        error: 'Invalid arguments: "url" must be a non-empty string',
      };
    }

    const url = args.url;
    const format: OutputFormat = args.format ?? 'markdown';
    const timeout = args.timeout ?? 15_000;

    // Upper bound on how much of an HTTP error body is quoted in `error`.
    const maxErrorBodyLength = 2_000;

    // ----- Check cache (lazy eviction) ----- //
    // Expired entries are purged first, so any hit below is still fresh.
    evictExpired();
    const key = cacheKey(url, format);
    const cached = cache.get(key);
    if (cached) {
      return { success: true, output: cached.output };
    }

    // ----- Fetch ----- //
    try {
      const response = await fetch(url, {
        signal: AbortSignal.timeout(timeout),
        headers: {
          'User-Agent': 'Flynn/0.1 (personal AI assistant)',
          Accept: 'text/html, application/json, text/plain, */*',
        },
      });

      if (!response.ok) {
        // Quote the error body for context, but keep it bounded, and never
        // let a failed body read hide the HTTP status itself.
        let detail: string;
        try {
          detail = await response.text();
        } catch {
          detail = response.statusText;
        }
        if (detail.length > maxErrorBodyLength) {
          detail = `${detail.slice(0, maxErrorBodyLength)} [truncated]`;
        }
        return {
          success: false,
          output: '',
          error: `HTTP ${response.status}: ${detail}`,
        };
      }

      const body = await response.text();
      const contentType = response.headers.get('content-type') ?? '';

      // ----- Process & truncate ----- //
      const processed = processResponse(body, contentType, format);
      const output = truncate(processed);

      // ----- Store in cache ----- //
      // Only successful results are cached; failures are always retried.
      cache.set(key, { output, timestamp: Date.now() });

      return { success: true, output };
    } catch (error) {
      // Network errors, timeouts, and processing failures all land here.
      return {
        success: false,
        output: '',
        error: error instanceof Error ? error.message : String(error),
      };
    }
  },
};

// ---------------------------------------------------------------------------
// Exported for testing — allows tests to clear the cache between runs
// ---------------------------------------------------------------------------

/**
 * Remove every entry from the module-level response cache.
 *
 * @internal — exposed for tests only
 */
export function _clearCache(): void {
  cache.clear();
}

/**
 * Re-exports the cache TTL and the truncation limit under
 * underscore-prefixed names.
 *
 * @internal — exposed for tests only
 */
export { CACHE_TTL_MS as _CACHE_TTL_MS, MAX_CONTENT_LENGTH as _MAX_CONTENT_LENGTH };