6d9e27a591
Add turndown + readability for clean content extraction: - HTML-to-markdown conversion with smart article extraction - Format parameter (markdown/text/html) - Response caching for repeated fetches - 10 tests
223 lines
7.2 KiB
TypeScript
223 lines
7.2 KiB
TypeScript
import { describe, it, expect, vi, beforeEach, afterEach, afterAll } from 'vitest';
import { webFetchTool, _clearCache, _MAX_CONTENT_LENGTH } from './web-fetch.js';

// ---------------------------------------------------------------------------
// Mock global fetch
// ---------------------------------------------------------------------------

// Shared stand-in for the global `fetch`. Each test programs what it resolves
// or rejects with, so code calling the global `fetch` gets the canned value
// instead of performing a real request.
const mockFetch = vi.fn();
vi.stubGlobal('fetch', mockFetch);

// Restore the original global once this file's tests have finished, so the
// stub cannot leak into other suites that share the same global scope.
afterAll(() => {
  vi.unstubAllGlobals();
});

// ---------------------------------------------------------------------------
// HTML fixture used by extraction tests
// ---------------------------------------------------------------------------

/**
 * Small but complete HTML page used as the response body in the tests.
 *
 * The <article> holds the text the tests look for ("Article Title",
 * "main content"). It is surrounded by markup the tests check is absent from
 * extracted output: a <script>, a <style>, a <nav> and a <footer>.
 *
 * The literal contains nothing but the markup itself; any stray separator
 * characters between the lines would become text nodes in the document.
 */
const SAMPLE_HTML = `<!DOCTYPE html>
<html><head><title>Test Page</title>
<script>console.log('js')</script>
<style>body { color: red }</style>
</head><body>
<nav>Menu items</nav>
<article>
<h1>Article Title</h1>
<p>This is the main content of the article. It contains important information.</p>
<p>Second paragraph with more details about the topic.</p>
</article>
<footer>Copyright 2026</footer>
</body></html>`;

// ---------------------------------------------------------------------------
// Setup / teardown
// ---------------------------------------------------------------------------

// Start every test from a clean slate: reset the fetch mock (programmed
// results and recorded calls) and call the module's `_clearCache()` hook —
// presumably so a response cached by one test is not served to the next.
beforeEach(() => {
  mockFetch.mockReset();
  _clearCache();
});

// Always switch back to real timers. The cache-TTL test enables fake timers,
// and restoring them here (rather than at the end of that test) still happens
// when the test fails part-way through.
afterEach(() => {
  vi.useRealTimers();
});

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

describe('web.fetch', () => {
  // Programs the fetch mock to resolve with a successful (200) Response-like
  // object exposing only the members these tests stub: `ok`, `status`,
  // `text()` and `headers`. The default values also fix the parameter types.
  const respondOk = (body = '', contentType = 'text/html') => {
    mockFetch.mockResolvedValue({
      ok: true,
      status: 200,
      text: async () => body,
      headers: new Headers({ 'content-type': contentType }),
    });
  };

  // ---- Metadata ----

  it('has correct metadata', () => {
    expect(webFetchTool.name).toBe('web.fetch');
    expect(webFetchTool.inputSchema.required).toContain('url');
    // New format property is advertised in the schema
    expect(webFetchTool.inputSchema.properties).toHaveProperty('format');
  });

  // ---- Default behaviour (markdown extraction) ----

  it('fetches a URL and returns markdown by default', async () => {
    respondOk(SAMPLE_HTML);

    const result = await webFetchTool.execute({ url: 'https://example.com' });

    expect(result.success).toBe(true);
    expect(result.output).toBeTruthy();
    // Should contain markdown-ish content (heading or paragraph text)
    expect(result.output).toContain('Article Title');
    // Should NOT contain raw HTML tags
    expect(result.output).not.toContain('<script>');
    expect(result.output).not.toContain('<nav>');
    expect(mockFetch).toHaveBeenCalledWith('https://example.com', expect.any(Object));
  });

  // ---- format=html ----

  it('returns raw HTML when format is html', async () => {
    respondOk(SAMPLE_HTML);

    const result = await webFetchTool.execute({
      url: 'https://example.com',
      format: 'html',
    });

    expect(result.success).toBe(true);
    // Raw HTML is returned unchanged
    expect(result.output).toContain('<article>');
    expect(result.output).toContain('<h1>Article Title</h1>');
    expect(result.output).toContain('<script>');
  });

  // ---- format=text ----

  it('returns extracted text when format is text', async () => {
    respondOk(SAMPLE_HTML);

    const result = await webFetchTool.execute({
      url: 'https://example.com',
      format: 'text',
    });

    expect(result.success).toBe(true);
    expect(result.output).toContain('Article Title');
    expect(result.output).toContain('main content');
    // No HTML tags
    expect(result.output).not.toContain('<');
  });

  // ---- JSON content-type ----

  it('returns prettified JSON for application/json content', async () => {
    const jsonPayload = '{"name":"Flynn","version":1}';
    respondOk(jsonPayload, 'application/json');

    const result = await webFetchTool.execute({ url: 'https://api.example.com/data' });

    expect(result.success).toBe(true);
    // Should be prettified (indented) JSON: still parses to the same value,
    // but spans more than one line.
    const parsed = JSON.parse(result.output);
    expect(parsed).toEqual({ name: 'Flynn', version: 1 });
    expect(result.output).toContain('\n'); // multi-line
  });

  // ---- HTTP error ----

  it('returns error on HTTP failure', async () => {
    // Kept inline: the only response that is not `ok` and carries no headers.
    mockFetch.mockResolvedValue({
      ok: false,
      status: 404,
      text: async () => 'Not Found',
      headers: new Headers(),
    });

    const result = await webFetchTool.execute({ url: 'https://example.com/nope' });

    expect(result.success).toBe(false);
    expect(result.error).toContain('404');
  });

  // ---- Network error ----

  it('returns error on network failure', async () => {
    mockFetch.mockRejectedValue(new Error('network error'));

    const result = await webFetchTool.execute({ url: 'https://down.example.com' });

    expect(result.success).toBe(false);
    expect(result.error).toContain('network error');
  });

  // ---- Caching ----

  it('caches responses and reuses them on the second call', async () => {
    respondOk('<html><body><p>Cached content</p></body></html>');

    const first = await webFetchTool.execute({ url: 'https://cached.example.com' });
    const second = await webFetchTool.execute({ url: 'https://cached.example.com' });

    expect(first.success).toBe(true);
    expect(second.success).toBe(true);
    expect(second.output).toBe(first.output);
    // fetch should have been called exactly once — second call served from cache
    expect(mockFetch).toHaveBeenCalledTimes(1);
  });

  it('expires cache entries after TTL', async () => {
    // Fake timers let the test jump past the TTL instantly; the real timers
    // are restored by the file-level afterEach hook.
    vi.useFakeTimers();

    respondOk('<html><body><p>Stale content</p></body></html>');

    // First call — populates cache
    await webFetchTool.execute({ url: 'https://stale.example.com' });
    expect(mockFetch).toHaveBeenCalledTimes(1);

    // Advance time past the 5-minute TTL
    vi.advanceTimersByTime(5 * 60 * 1_000 + 1);

    // Second call — cache should be expired, triggers a new fetch
    await webFetchTool.execute({ url: 'https://stale.example.com' });
    expect(mockFetch).toHaveBeenCalledTimes(2);
  });

  // ---- Truncation ----

  it('truncates content exceeding max length with a marker', async () => {
    const longContent = 'A'.repeat(_MAX_CONTENT_LENGTH + 1_000);
    respondOk(longContent, 'text/plain');

    const result = await webFetchTool.execute({ url: 'https://big.example.com' });

    expect(result.success).toBe(true);
    expect(result.output).toContain('[content truncated]');
    // The output, marker included, must be shorter than the original payload
    expect(result.output.length).toBeLessThan(longContent.length);
  });
});