feat: add image.analyze tool for vision model analysis

Provides a factory createImageAnalyzeTool(modelClient) that sends an image to a vision-capable model and returns a textual analysis. Includes 15 tests covering URL and base64 input, prompt handling, input validation, and error cases.
2026-02-07 09:08:53 -08:00
parent d4530a7034
commit 1e6f6bb5a4
2 changed files with 427 additions and 0 deletions
@@ -0,0 +1,306 @@
+import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
+import type { ModelClient, ChatRequest, ChatResponse } from '../../models/types.js';
+import { createImageAnalyzeTool } from './image-analyze.js';
+
+// Unit tests for the image.analyze tool. The model client is replaced by a
+// vitest mock, so these tests only verify argument validation, the shape of
+// the request handed to the client, and how the client's result is reported.
+describe('image.analyze tool', () => {
+  const DEFAULT_PROMPT_TEXT = 'Describe this image in detail.';
+  const IMAGE_URL = 'https://example.com/image.jpg';
+
+  let mockClient: ModelClient & { chat: ReturnType<typeof vi.fn> };
+
+  /** Builds a minimal successful chat response carrying `content`. */
+  const modelReply = (content: string) => ({
+    content,
+    stopReason: 'end_turn',
+    usage: { inputTokens: 100, outputTokens: 50 }
+  });
+
+  /** Matcher for an image content part whose source contains the given fields. */
+  const imagePart = (source: Record<string, string>) => ({
+    type: 'image',
+    source: expect.objectContaining(source)
+  });
+
+  /** Matcher for a `messages` array containing a user turn with the given text and image parts. */
+  const userTurn = (text: string, image: unknown) =>
+    expect.arrayContaining([
+      expect.objectContaining({
+        role: 'user',
+        content: expect.arrayContaining([{ type: 'text', text }, image])
+      })
+    ]);
+
+  beforeEach(() => {
+    // A fresh mock per test keeps call counts and queued results isolated.
+    mockClient = {
+      chat: vi.fn()
+    };
+  });
+
+  afterEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it('has correct metadata', () => {
+    const tool = createImageAnalyzeTool(mockClient);
+    expect(tool.name).toBe('image.analyze');
+    expect(tool.inputSchema.required).toHaveLength(0);
+  });
+
+  it('analyzes image from URL', async () => {
+    mockClient.chat.mockResolvedValueOnce(modelReply('This is a beautiful sunset over the ocean.'));
+
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({ url: IMAGE_URL });
+
+    expect(result.success).toBe(true);
+    expect(result.output).toBe('This is a beautiful sunset over the ocean.');
+    expect(mockClient.chat).toHaveBeenCalledTimes(1);
+    expect(mockClient.chat).toHaveBeenCalledWith(
+      expect.objectContaining({
+        messages: userTurn(
+          DEFAULT_PROMPT_TEXT,
+          imagePart({ type: 'url', media_type: 'image/jpeg', url: IMAGE_URL })
+        ),
+        system: expect.stringContaining('vision assistant'),
+        maxTokens: 1024
+      })
+    );
+  });
+
+  it('analyzes image from base64 data', async () => {
+    const base64Data = 'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==';
+    mockClient.chat.mockResolvedValueOnce(modelReply('This is a sample image.'));
+
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({
+      data: base64Data,
+      media_type: 'image/png'
+    });
+
+    expect(result.success).toBe(true);
+    expect(result.output).toBe('This is a sample image.');
+    expect(mockClient.chat).toHaveBeenCalledTimes(1);
+    expect(mockClient.chat).toHaveBeenCalledWith(
+      expect.objectContaining({
+        messages: userTurn(
+          DEFAULT_PROMPT_TEXT,
+          imagePart({ type: 'base64', media_type: 'image/png', data: base64Data })
+        ),
+        system: expect.stringContaining('vision assistant'),
+        maxTokens: 1024
+      })
+    );
+  });
+
+  it('uses custom prompt', async () => {
+    mockClient.chat.mockResolvedValueOnce(modelReply('The image shows a cat sitting on a mat.'));
+
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({
+      url: 'https://example.com/cat.jpg',
+      prompt: 'What is in this image?'
+    });
+
+    expect(result.success).toBe(true);
+    expect(result.output).toBe('The image shows a cat sitting on a mat.');
+    expect(mockClient.chat).toHaveBeenCalledWith(
+      expect.objectContaining({
+        messages: userTurn('What is in this image?', expect.any(Object))
+      })
+    );
+  });
+
+  it('defaults prompt to "Describe this image in detail."', async () => {
+    mockClient.chat.mockResolvedValueOnce(modelReply('This is the default prompt response.'));
+
+    const tool = createImageAnalyzeTool(mockClient);
+    await tool.execute({ url: IMAGE_URL });
+
+    expect(mockClient.chat).toHaveBeenCalledWith(
+      expect.objectContaining({
+        messages: userTurn(DEFAULT_PROMPT_TEXT, expect.any(Object))
+      })
+    );
+  });
+
+  it('fails when neither url nor data is provided', async () => {
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({});
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('Either "url" or "data" must be provided');
+    expect(mockClient.chat).not.toHaveBeenCalled();
+  });
+
+  it('fails when both url and data are provided', async () => {
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({
+      url: IMAGE_URL,
+      data: 'base64data'
+    });
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('Cannot provide both "url" and "data"');
+    expect(mockClient.chat).not.toHaveBeenCalled();
+  });
+
+  it('fails when data is provided without media_type', async () => {
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({
+      data: 'base64data',
+      prompt: 'Test'
+    });
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('media_type is required when providing data');
+    expect(mockClient.chat).not.toHaveBeenCalled();
+  });
+
+  it('fails with invalid media_type', async () => {
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({
+      data: 'base64data',
+      media_type: 'image/tiff'
+    });
+
+    expect(result.success).toBe(false);
+    expect(result.error).toContain('Invalid media_type');
+    expect(mockClient.chat).not.toHaveBeenCalled();
+  });
+
+  it('passes valid media_types', async () => {
+    const validTypes = ['image/jpeg', 'image/png', 'image/gif', 'image/webp'];
+
+    for (const mediaType of validTypes) {
+      // Replace the mock on every iteration so the call count restarts at zero.
+      mockClient.chat = vi.fn().mockResolvedValueOnce(modelReply('Success'));
+
+      const tool = createImageAnalyzeTool(mockClient);
+      const result = await tool.execute({
+        data: 'base64data',
+        media_type: mediaType
+      });
+
+      expect(result.success).toBe(true);
+      expect(mockClient.chat).toHaveBeenCalledTimes(1);
+      // The accepted media type must reach the model unchanged.
+      expect(mockClient.chat).toHaveBeenCalledWith(
+        expect.objectContaining({
+          messages: userTurn(
+            DEFAULT_PROMPT_TEXT,
+            imagePart({ type: 'base64', media_type: mediaType, data: 'base64data' })
+          )
+        })
+      );
+    }
+  });
+
+  it('handles model client errors', async () => {
+    mockClient.chat.mockRejectedValueOnce(new Error('Model API error'));
+
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({ url: IMAGE_URL });
+
+    expect(result.success).toBe(false);
+    expect(result.output).toBe('');
+    expect(result.error).toBe('Model API error');
+  });
+
+  it('handles non-Error exceptions', async () => {
+    mockClient.chat.mockRejectedValueOnce('String error');
+
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({ url: IMAGE_URL });
+
+    expect(result.success).toBe(false);
+    expect(result.output).toBe('');
+    expect(result.error).toBe('String error');
+  });
+
+  it('sends the vision-assistant system prompt', async () => {
+    mockClient.chat.mockResolvedValueOnce(modelReply('Analysis complete.'));
+
+    const tool = createImageAnalyzeTool(mockClient);
+    await tool.execute({
+      url: IMAGE_URL,
+      prompt: 'Analyze the colors.'
+    });
+
+    // The system prompt is fixed by the tool; the user prompt must not replace it.
+    expect(mockClient.chat).toHaveBeenCalledWith(
+      expect.objectContaining({ system: expect.stringContaining('vision assistant') })
+    );
+    expect(mockClient.chat).toHaveBeenCalledWith(
+      expect.objectContaining({ system: expect.stringContaining('Analyze the provided image') })
+    );
+  });
+
+  it('requests at most 1024 output tokens', async () => {
+    mockClient.chat.mockResolvedValueOnce(modelReply('Short response'));
+
+    const tool = createImageAnalyzeTool(mockClient);
+    await tool.execute({ url: IMAGE_URL });
+
+    expect(mockClient.chat).toHaveBeenCalledWith(expect.objectContaining({ maxTokens: 1024 }));
+  });
+
+  it('passes through model response content', async () => {
+    const expectedContent = 'Detailed analysis of the image...';
+    mockClient.chat.mockResolvedValueOnce(modelReply(expectedContent));
+
+    const tool = createImageAnalyzeTool(mockClient);
+    const result = await tool.execute({ url: IMAGE_URL });
+
+    expect(result.success).toBe(true);
+    expect(result.output).toBe(expectedContent);
+  });
+});
@@ -0,0 +1,121 @@
+import type { ModelClient } from '../../models/types.js';
+import type { Tool, ToolResult } from '../types.js';
+
+/** Arguments accepted by the `image.analyze` tool's `execute` handler. */
+interface ImageAnalyzeArgs {
+  /** URL of the image to analyze; must not be combined with `data`. */
+  url?: string;
+  /** Base64-encoded image data; must not be combined with `url`. */
+  data?: string;
+  /** MIME type of the image; required whenever `data` is supplied. */
+  media_type?: string;
+  /** Instruction sent to the model with the image; a default is used when missing or empty. */
+  prompt?: string;
+}
+
+// Prompt text used when the caller does not supply a non-empty `prompt`.
+const DEFAULT_PROMPT = 'Describe this image in detail.';
+
+// MIME types accepted for the `media_type` argument; any other value is rejected.
+const VALID_MEDIA_TYPES = ['image/jpeg', 'image/png', 'image/gif', 'image/webp'];
+
+/**
+ * Creates the `image.analyze` tool, which sends one image (by URL or as
+ * base64 data) together with a text prompt to `modelClient.chat` and returns
+ * the model's reply as the tool output.
+ *
+ * The returned tool never throws from `execute`: invalid arguments and errors
+ * raised by the model client are both reported as
+ * `{ success: false, output: '', error }`.
+ *
+ * @param modelClient - Client whose `chat` method performs the model call.
+ * @returns The tool definition (name, description, input schema, executor).
+ */
+export function createImageAnalyzeTool(modelClient: ModelClient): Tool {
+  const systemPrompt =
+    'You are a vision assistant. Analyze the provided image according to the user\'s request. Provide detailed, helpful descriptions.';
+
+  // Every argument the tool understands is an optional string.
+  const stringFields = ['url', 'data', 'media_type', 'prompt'] as const;
+
+  /** Builds the failure result shared by every validation and error path. */
+  const fail = (error: string): ToolResult => ({ success: false, output: '', error });
+
+  return {
+    name: 'image.analyze',
+    description: 'Analyze an image using a vision model. Accepts a URL or base64-encoded image data. Returns a text description or analysis based on the provided prompt.',
+    inputSchema: {
+      type: 'object',
+      properties: {
+        url: {
+          type: 'string',
+          description: 'URL of the image to analyze'
+        },
+        data: {
+          type: 'string',
+          description: 'Base64-encoded image data (alternative to url)'
+        },
+        media_type: {
+          type: 'string',
+          description:
+            'MIME type of the image (required when using data). One of: image/jpeg, image/png, image/gif, image/webp'
+        },
+        prompt: {
+          type: 'string',
+          description:
+            'What to analyze or describe about the image. Default: "Describe this image in detail."'
+        }
+      },
+      // "url" and "data" are alternatives, so neither can be listed as required;
+      // the either/or rule is enforced in execute().
+      required: []
+    },
+    execute: async (rawArgs: unknown): Promise<ToolResult> => {
+      try {
+        // A missing argument object is treated like an empty one so the caller
+        // gets the "url or data" message instead of a leaked TypeError.
+        const input = rawArgs === undefined || rawArgs === null ? {} : rawArgs;
+        if (typeof input !== 'object' || Array.isArray(input)) {
+          return fail('Arguments must be an object');
+        }
+
+        // The arguments come from an untyped caller: reject non-string values
+        // up front instead of forwarding them to the model. null counts as "absent".
+        const fields = input as Record<string, unknown>;
+        for (const key of stringFields) {
+          const value = fields[key];
+          if (value !== undefined && value !== null && typeof value !== 'string') {
+            return fail(`"${key}" must be a string`);
+          }
+        }
+
+        const { url, data, media_type: mediaType, prompt } = input as ImageAnalyzeArgs;
+
+        // Empty strings are deliberately treated the same as missing values.
+        if (!url && !data) {
+          return fail('Either "url" or "data" must be provided');
+        }
+
+        if (url && data) {
+          return fail('Cannot provide both "url" and "data" - choose one');
+        }
+
+        if (mediaType && !VALID_MEDIA_TYPES.includes(mediaType)) {
+          return fail(
+            `Invalid media_type: ${mediaType}. Must be one of: ${VALID_MEDIA_TYPES.join(', ')}`
+          );
+        }
+
+        let imageSource;
+        if (url) {
+          imageSource = {
+            type: 'url' as const,
+            // URL images may omit media_type; JPEG is the fallback label.
+            media_type: mediaType || 'image/jpeg',
+            url
+          };
+        } else {
+          // Only base64 input reaches this branch, and it must declare its type.
+          if (!mediaType) {
+            return fail('media_type is required when providing data');
+          }
+          imageSource = {
+            type: 'base64' as const,
+            media_type: mediaType,
+            data
+          };
+        }
+
+        const response = await modelClient.chat({
+          messages: [
+            {
+              role: 'user' as const,
+              content: [
+                { type: 'text' as const, text: prompt || DEFAULT_PROMPT },
+                { type: 'image' as const, source: imageSource }
+              ]
+            }
+          ],
+          system: systemPrompt,
+          maxTokens: 1024
+        });
+
+        return {
+          success: true,
+          output: response.content
+        };
+      } catch (error) {
+        // Model client failures are reported in the result rather than rethrown.
+        return fail(error instanceof Error ? error.message : String(error));
+      }
+    }
+  };
+}