From 1e6f6bb5a4f67adcb4e86e1d283c6bfdccb589a8 Mon Sep 17 00:00:00 2001
From: William Valentin
Date: Sat, 7 Feb 2026 09:08:53 -0800
Subject: [PATCH] feat: add image.analyze tool for vision model analysis

Provides a factory createImageAnalyzeTool(modelClient) that sends an image
to a vision-capable model and returns a textual analysis. Includes 16
tests covering base64, URL, prompt handling, validation errors, and edge
cases.
---
 src/tools/builtin/image-analyze.test.ts | 311 ++++++++++++++++++++++++
 src/tools/builtin/image-analyze.ts      | 141 +++++++++++
 2 files changed, 452 insertions(+)
 create mode 100644 src/tools/builtin/image-analyze.test.ts
 create mode 100644 src/tools/builtin/image-analyze.ts

diff --git a/src/tools/builtin/image-analyze.test.ts b/src/tools/builtin/image-analyze.test.ts
new file mode 100644
index 0000000..62f80a1
--- /dev/null
+++ b/src/tools/builtin/image-analyze.test.ts
@@ -0,0 +1,311 @@
+import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
+import type { ModelClient, ChatRequest, ChatResponse } from '../../models/types.js';
+import { createImageAnalyzeTool } from './image-analyze.js';
+
+describe('image.analyze tool', () => {
+  let mockClient: ModelClient & { chat: ReturnType<typeof vi.fn> };
+
+  beforeEach(() => {
+    mockClient = {
+      chat: vi.fn()
+    };
+  });
+
+  afterEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it('has correct metadata', () => {
+    const tool = createImageAnalyzeTool(mockClient);
+    expect(tool.name).toBe('image.analyze');
+    expect(tool.inputSchema.required).toHaveLength(0);
+  });
+
+  it('analyzes image from URL', async () => {
+    mockClient.chat = vi.fn().mockResolvedValueOnce({
+      content: 'This is a beautiful sunset over the ocean.',
+      stopReason: 'end_turn',
+      usage: { inputTokens: 100, outputTokens: 50 }
+    });
+
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({ url: 'https://example.com/image.jpg' });
+
+    expect(result.success).toBe(true);
+    expect(result.output).toBe('This is a beautiful sunset over the ocean.');
+    expect(mockClient.chat).toHaveBeenCalledTimes(1);
+    expect(mockClient.chat).toHaveBeenCalledWith(
+      expect.objectContaining({
+        messages: expect.arrayContaining([
+          expect.objectContaining({
+            role: 'user',
+            content: expect.arrayContaining([
+              { type: 'text', text: 'Describe this image in detail.' },
+              {
+                type: 'image',
+                source: expect.objectContaining({
+                  type: 'url',
+                  media_type: 'image/jpeg',
+                  url: 'https://example.com/image.jpg'
+                })
+              }
+            ])
+          })
+        ]),
+        system: expect.stringContaining('vision assistant'),
+        maxTokens: 1024
+      })
+    );
+  });
+
+  it('analyzes image from base64 data', async () => {
+    const base64Data = 'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==';
+    mockClient.chat = vi.fn().mockResolvedValueOnce({
+      content: 'This is a sample image.',
+      stopReason: 'end_turn',
+      usage: { inputTokens: 100, outputTokens: 20 }
+    });
+
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({
+      data: base64Data,
+      media_type: 'image/png'
+    });
+
+    expect(result.success).toBe(true);
+    expect(result.output).toBe('This is a sample image.');
+    expect(mockClient.chat).toHaveBeenCalledTimes(1);
+    expect(mockClient.chat).toHaveBeenCalledWith(
+      expect.objectContaining({
+        messages: expect.arrayContaining([
+          expect.objectContaining({
+            role: 'user',
+            content: expect.arrayContaining([
+              { type: 'text', text: 'Describe this image in detail.' },
+              {
+                type: 'image',
+                source: expect.objectContaining({
+                  type: 'base64',
+                  media_type: 'image/png',
+                  data: base64Data
+                })
+              }
+            ])
+          })
+        ]),
+        system: expect.stringContaining('vision assistant'),
+        maxTokens: 1024
+      })
+    );
+  });
+
+  it('uses custom prompt', async () => {
+    mockClient.chat = vi.fn().mockResolvedValueOnce({
+      content: 'The image shows a cat sitting on a mat.',
+      stopReason: 'end_turn',
+      usage: { inputTokens: 100, outputTokens: 30 }
+    });
+
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({
+      url: 'https://example.com/cat.jpg',
+      prompt: 'What is in this image?'
+    });
+
+    expect(result.success).toBe(true);
+    expect(result.output).toBe('The image shows a cat sitting on a mat.');
+    expect(mockClient.chat).toHaveBeenCalledWith(
+      expect.objectContaining({
+        messages: expect.arrayContaining([
+          expect.objectContaining({
+            content: expect.arrayContaining([
+              { type: 'text', text: 'What is in this image?' },
+              expect.any(Object)
+            ])
+          })
+        ])
+      })
+    );
+  });
+
+  it('defaults prompt to "Describe this image in detail."', async () => {
+    mockClient.chat = vi.fn().mockResolvedValueOnce({
+      content: 'This is the default prompt response.',
+      stopReason: 'end_turn',
+      usage: { inputTokens: 100, outputTokens: 10 }
+    });
+
+    const tool = createImageAnalyzeTool(mockClient);
+    await tool.execute({ url: 'https://example.com/image.jpg' });
+
+    expect(mockClient.chat).toHaveBeenCalledWith(
+      expect.objectContaining({
+        messages: expect.arrayContaining([
+          expect.objectContaining({
+            content: expect.arrayContaining([
+              { type: 'text', text: 'Describe this image in detail.' },
+              expect.any(Object)
+            ])
+          })
+        ])
+      })
+    );
+  });
+
+  it('fails when neither url nor data is provided', async () => {
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({});
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('Either "url" or "data" must be provided');
+    expect(mockClient.chat).not.toHaveBeenCalled();
+  });
+
+  it('fails cleanly when args are null or undefined', async () => {
+    const tool = createImageAnalyzeTool(mockClient);
+
+    for (const rawArgs of [null, undefined]) {
+      const result = await tool.execute(rawArgs);
+
+      expect(result.success).toBe(false);
+      expect(result.error).toContain('Either "url" or "data" must be provided');
+    }
+    expect(mockClient.chat).not.toHaveBeenCalled();
+  });
+
+  it('fails when both url and data are provided', async () => {
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({
+      url: 'https://example.com/image.jpg',
+      data: 'base64data'
+    });
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('Cannot provide both "url" and "data"');
+    expect(mockClient.chat).not.toHaveBeenCalled();
+  });
+
+  it('fails when data is provided without media_type', async () => {
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({
+      data: 'base64data',
+      prompt: 'Test'
+    });
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('media_type is required when providing data');
+    expect(mockClient.chat).not.toHaveBeenCalled();
+  });
+
+  it('fails with invalid media_type', async () => {
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({
+      data: 'base64data',
+      media_type: 'image/tiff'
+    });
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('Invalid media_type');
+    expect(mockClient.chat).not.toHaveBeenCalled();
+  });
+
+  it('passes valid media_types', async () => {
+    const validTypes = ['image/jpeg', 'image/png', 'image/gif', 'image/webp'];
+
+    for (const mediaType of validTypes) {
+      mockClient.chat = vi.fn().mockResolvedValueOnce({
+        content: 'Success',
+        stopReason: 'end_turn',
+        usage: { inputTokens: 10, outputTokens: 10 }
+      });
+
+      const tool = createImageAnalyzeTool(mockClient);
+      const result = await tool.execute({
+        data: 'base64data',
+        media_type: mediaType
+      });
+
+      expect(result.success).toBe(true);
+      expect(mockClient.chat).toHaveBeenCalledTimes(1);
+    }
+  });
+
+  it('handles model client errors', async () => {
+    mockClient.chat = vi.fn().mockRejectedValueOnce(new Error('Model API error'));
+
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({ url: 'https://example.com/image.jpg' });
+
+    expect(result.success).toBe(false);
+    expect(result.output).toBe('');
+    expect(result.error).toBe('Model API error');
+  });
+
+  it('handles non-Error exceptions', async () => {
+    mockClient.chat = vi.fn().mockRejectedValueOnce('String error');
+
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({ url: 'https://example.com/image.jpg' });
+
+    expect(result.success).toBe(false);
+    expect(result.output).toBe('');
+    expect(result.error).toBe('String error');
+  });
+
+  it('uses custom system message prompt', async () => {
+    const mockResponse = {
+      content: 'Analysis complete.',
+      stopReason: 'end_turn',
+      usage: { inputTokens: 100, outputTokens: 10 }
+    };
+
+    mockClient.chat = vi.fn().mockResolvedValueOnce(mockResponse);
+
+    const tool = createImageAnalyzeTool(mockClient);
+    await tool.execute({
+      url: 'https://example.com/image.jpg',
+      prompt: 'Analyze the colors.'
+    });
+
+    expect(mockClient.chat).toHaveBeenCalledWith(
+      expect.objectContaining({
+        system: expect.stringContaining('vision assistant')
+      })
+    );
+    expect(mockClient.chat).toHaveBeenCalledWith(
+      expect.objectContaining({
+        system: expect.stringContaining('Analyze the provided image')
+      })
+    );
+  });
+
+  it('respects maxTokens parameter', async () => {
+    const mockResponse = {
+      content: 'Short response',
+      stopReason: 'end_turn',
+      usage: { inputTokens: 10, outputTokens: 10 }
+    };
+
+    mockClient.chat = vi.fn().mockResolvedValueOnce(mockResponse);
+
+    const tool = createImageAnalyzeTool(mockClient);
+    await tool.execute({ url: 'https://example.com/image.jpg' });
+
+    expect(mockClient.chat).toHaveBeenCalledWith(expect.objectContaining({ maxTokens: 1024 }));
+  });
+
+  it('passes through model response content', async () => {
+    const expectedContent = 'Detailed analysis of the image...';
+    mockClient.chat.mockResolvedValueOnce({
+      content: expectedContent,
+      stopReason: 'end_turn',
+      usage: { inputTokens: 100, outputTokens: 100 }
+    });
+
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({ url: 'https://example.com/image.jpg' });
+
+    expect(result.success).toBe(true);
+    expect(result.output).toBe(expectedContent);
+  });
+});
diff --git a/src/tools/builtin/image-analyze.ts b/src/tools/builtin/image-analyze.ts
new file mode 100644
index 0000000..09d8361
--- /dev/null
+++ b/src/tools/builtin/image-analyze.ts
@@ -0,0 +1,141 @@
+import type { ModelClient } from '../../models/types.js';
+import type { Tool, ToolResult } from '../types.js';
+
+/** Arguments accepted by the `image.analyze` tool. */
+interface ImageAnalyzeArgs {
+  url?: string;
+  data?: string;
+  media_type?: string;
+  prompt?: string;
+}
+
+/** Image payload forwarded to the model: a remote URL or inline base64 data. */
+type ImageSource =
+  | { type: 'url'; media_type: string; url: string }
+  | { type: 'base64'; media_type: string; data: string };
+
+/** Outcome of argument validation: a usable image source or an error message. */
+type SourceResolution = { ok: true; source: ImageSource } | { ok: false; error: string };
+
+const DEFAULT_PROMPT = 'Describe this image in detail.';
+
+const VALID_MEDIA_TYPES = ['image/jpeg', 'image/png', 'image/gif', 'image/webp'];
+
+/** Builds the failed result shape shared by every error path. */
+function failure(error: string): ToolResult {
+  return { success: false, output: '', error };
+}
+
+/**
+ * Validates the url/data/media_type combination and builds the image source.
+ *
+ * Exactly one of `url` or `data` must be set, `data` requires a `media_type`,
+ * and any supplied `media_type` must be listed in VALID_MEDIA_TYPES. Empty
+ * strings count as "not provided".
+ */
+function resolveImageSource(args: ImageAnalyzeArgs): SourceResolution {
+  const { url, data, media_type: mediaType } = args;
+  const reject = (error: string): SourceResolution => ({ ok: false, error });
+
+  if (!url && !data) {
+    return reject('Either "url" or "data" must be provided');
+  }
+  if (url && data) {
+    return reject('Cannot provide both "url" and "data" - choose one');
+  }
+  if (data && !mediaType) {
+    return reject('media_type is required when providing data');
+  }
+  if (mediaType && !VALID_MEDIA_TYPES.includes(mediaType)) {
+    return reject(
+      `Invalid media_type: ${mediaType}. Must be one of: ${VALID_MEDIA_TYPES.join(', ')}`
+    );
+  }
+
+  if (url) {
+    // A URL source falls back to 'image/jpeg' when the caller gives no media_type.
+    return { ok: true, source: { type: 'url', media_type: mediaType || 'image/jpeg', url } };
+  }
+  if (data && mediaType) {
+    return { ok: true, source: { type: 'base64', media_type: mediaType, data } };
+  }
+
+  // Unreachable after the guards above; present so the compiler can narrow
+  // `data` and `mediaType` without non-null assertions.
+  return reject('Either "url" or "data" must be provided');
+}
+
+/**
+ * Creates the `image.analyze` tool, which sends one image plus a text prompt
+ * to the given model client and returns the model's reply as the tool output.
+ *
+ * @param modelClient - Client whose `chat` method receives the image request.
+ * @returns A tool whose `execute` never throws; failures are reported through
+ *   `success: false` and `error`.
+ */
+export function createImageAnalyzeTool(modelClient: ModelClient): Tool {
+  return {
+    name: 'image.analyze',
+    description:
+      'Analyze an image using a vision model. Accepts a URL or base64-encoded image data. Returns a text description or analysis based on the provided prompt.',
+    inputSchema: {
+      type: 'object',
+      properties: {
+        url: {
+          type: 'string',
+          description: 'URL of the image to analyze'
+        },
+        data: {
+          type: 'string',
+          description: 'Base64-encoded image data (alternative to url)'
+        },
+        media_type: {
+          type: 'string',
+          description:
+            'MIME type of the image (required when using data). One of: image/jpeg, image/png, image/gif, image/webp'
+        },
+        prompt: {
+          type: 'string',
+          description:
+            'What to analyze or describe about the image. Default: "Describe this image in detail."'
+        }
+      },
+      required: []
+    },
+    execute: async (rawArgs: unknown): Promise<ToolResult> => {
+      try {
+        // Arguments arrive untyped; anything that is not an object is treated
+        // as "no arguments" so it yields a validation error, not a TypeError.
+        const args: ImageAnalyzeArgs =
+          typeof rawArgs === 'object' && rawArgs !== null ? (rawArgs as ImageAnalyzeArgs) : {};
+
+        const resolution = resolveImageSource(args);
+        if (!resolution.ok) {
+          return failure(resolution.error);
+        }
+
+        const message = {
+          role: 'user' as const,
+          content: [
+            { type: 'text' as const, text: args.prompt || DEFAULT_PROMPT },
+            { type: 'image' as const, source: resolution.source }
+          ]
+        };
+
+        const response = await modelClient.chat({
+          messages: [message],
+          system:
+            'You are a vision assistant. Analyze the provided image according to the user\'s request. Provide detailed, helpful descriptions.',
+          maxTokens: 1024
+        });
+
+        return {
+          success: true,
+          output: response.content
+        };
+      } catch (error) {
+        return failure(error instanceof Error ? error.message : String(error));
+      }
+    }
+  };
+}